From 0ea31c3888088e19c5c2621db6cbf0c04a34dbb3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 16:51:44 -0600 Subject: [PATCH 001/326] Add sptrsv execution space overloads --- docs/developer/apidocs/sparse.rst | 13 + .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 179 +++++----- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 326 +++++++++++------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 38 +- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 57 +-- .../KokkosSparse_sptrsv_symbolic_spec.hpp | 22 +- sparse/src/KokkosSparse_sptrsv.hpp | 253 ++++++++++++-- 7 files changed, 599 insertions(+), 289 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 415f72eec8..3a55e50c8b 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -94,3 +94,16 @@ par_ilut gmres ----- .. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) + +sptrsv +------ +.. doxygenfunction:: sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7605f03fa2..0b9afa7796 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -22,10 +22,11 @@ namespace KokkosSparse { namespace Impl { -template -void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, +void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -61,6 +62,9 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int64_t nnz = static_cast(entries.extent(0)); size_t pBufferSize; void *rm; @@ -98,13 +102,13 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); // Create dummy dense vector B (RHS) - nnz_scalar_view_t b_dummy("b_dummy", nrows); + nnz_scalar_view_t b_dummy(Kokkos::view_alloc(space, "b_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecBDescr_dummy), static_cast(nrows), b_dummy.data(), cudaValueType)); // Create dummy dense vector X (LHS) - nnz_scalar_view_t x_dummy("x_dummy", nrows); + nnz_scalar_view_t x_dummy(Kokkos::view_alloc(space, "x_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecXDescr_dummy), static_cast(nrows), x_dummy.data(), cudaValueType)); @@ -163,9 +167,12 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + cusparseStatus_t status; status = cusparseCreateCsrsv2Info(&(h->info)); if (CUSPARSE_STATUS_SUCCESS != status) @@ -178,85 +185,86 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); if (std::is_same::value) { cusparseDcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (double*)vals, (int*)rm, (int*)ent, h->info, + (double *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseDcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (double*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same::value) { cusparseScsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (float*)vals, (int*)rm, (int*)ent, h->info, + (float *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseScsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (float *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseZcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; - status = cusparseZcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuDoubleComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + status = + cusparseZcsrsv2_analysis(h->handle, h->transpose, nrows, nnz, + h->descr, (cuDoubleComplex *)vals, (int *)rm, + (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseCcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, - &pBufferSize); + (cuComplex *)vals, (int *)rm, (int *)ent, + h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseCcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; @@ -281,10 +289,11 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, } template < - typename KernelHandle, typename ain_row_index_view_type, - typename ain_nonzero_index_view_type, typename ain_values_scalar_view_type, - typename b_values_scalar_view_type, typename x_values_scalar_view_type> -void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, + typename ExecutionSpace, typename KernelHandle, + typename ain_row_index_view_type, typename ain_nonzero_index_view_type, + typename ain_values_scalar_view_type, typename b_values_scalar_view_type, + typename x_values_scalar_view_type> +void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -323,6 +332,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + const scalar_type alpha = scalar_type(1.0); cudaDataType cudaValueType = cuda_data_type_from(); @@ -354,18 +366,21 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, if (std::is_same::value) { cusparseStatus_t status; - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int nnz = entries.extent_int(0); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr() - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); - const scalar_type* bv = rhs.data(); - scalar_type* xv = lhs.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); + const scalar_type *bv = rhs.data(); + scalar_type *xv = lhs.data(); if (std::is_same::value) { if (h->pBuffer == nullptr) { @@ -373,10 +388,10 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, } const double alpha = double(1); - status = cusparseDcsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (double*)vals, (int*)rm, - (int*)ent, h->info, (double*)bv, - (double*)xv, h->policy, h->pBuffer); + status = cusparseDcsrsv2_solve( + h->handle, h->transpose, nrows, nnz, &alpha, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, (double *)bv, (double *)xv, h->policy, + h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -387,9 +402,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, const float alpha = float(1); status = cusparseScsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, (float*)bv, (float*)xv, - h->policy, h->pBuffer); + &alpha, h->descr, (float *)vals, (int *)rm, + (int *)ent, h->info, (float *)bv, + (float *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -399,8 +414,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseZcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, - (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h->policy, h->pBuffer); + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, + (cuDoubleComplex *)bv, (cuDoubleComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -410,8 +425,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseCcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, (cuComplex*)bv, - (cuComplex*)xv, h->policy, h->pBuffer); + (cuComplex *)vals, (int *)rm, (int *)ent, h->info, (cuComplex *)bv, + (cuComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -539,13 +554,13 @@ void sptrsvcuSPARSE_solve_streams( "CUSPARSE requires local ordinals to be integer.\n"); } else { const scalar_type alpha = scalar_type(1.0); - std::vector sptrsv_handle_v(nstreams); - std::vector h_v(nstreams); - std::vector rm_v(nstreams); - std::vector ent_v(nstreams); - std::vector vals_v(nstreams); - std::vector bv_v(nstreams); - std::vector xv_v(nstreams); + std::vector sptrsv_handle_v(nstreams); + std::vector h_v(nstreams); + std::vector rm_v(nstreams); + std::vector ent_v(nstreams); + std::vector vals_v(nstreams); + std::vector bv_v(nstreams); + std::vector xv_v(nstreams); for (int i = 0; i < nstreams; i++) { sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); @@ -560,8 +575,8 @@ void sptrsvcuSPARSE_solve_streams( } rm_v[i] = !std::is_same::value ? sptrsv_handle_v[i]->get_int_rowmap_ptr() - : reinterpret_cast(row_map_v[i].data()); - ent_v[i] = reinterpret_cast(entries_v[i].data()); + : reinterpret_cast(row_map_v[i].data()); + ent_v[i] = reinterpret_cast(entries_v[i].data()); vals_v[i] = values_v[i].data(); bv_v[i] = rhs_v[i].data(); xv_v[i] = lhs_v[i].data(); @@ -573,42 +588,42 @@ void sptrsvcuSPARSE_solve_streams( if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else { throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index b14c9be072..1d8613922b 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2891,16 +2891,15 @@ void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, #endif -template -void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -2981,8 +2980,10 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( @@ -2990,8 +2991,8 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; - int team_size = thandle.get_team_size(); + using team_policy_t = Kokkos::TeamPolicy; + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor; + using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = Kokkos::View; @@ -3079,9 +3088,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; @@ -3118,7 +3130,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, auto Ljj = Kokkos::subview( viewL, range_type(0, nsrow), Kokkos::ALL()); // s-th supernocal column of L - KokkosBlas::gemv("N", one, Ljj, Xj, zero, Y); + KokkosBlas::gemv(space, "N", one, Ljj, Xj, zero, Y); } else { auto Xj = Kokkos::subview( lhs, @@ -3131,7 +3143,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (invert_diagonal) { auto Y = Kokkos::subview( work, range_type(workoffset, workoffset + nscol)); - KokkosBlas::gemv("N", one, Ljj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ljj, Y, zero, Xj); } else { char unit_diag = (unit_diagonal ? 'U' : 'N'); // NOTE: we currently supports only default_layout = @@ -3139,7 +3151,9 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "N", &unit_diag, one, Ljj, Xjj); + KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, + Xjj); + // TODO: space.fence(); Kokkos::fence(); } // update off-diagonal blocks @@ -3155,7 +3169,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, viewL, range_type(nscol, nsrow), Kokkos::ALL()); // off-diagonal blocks of s-th supernodal // column of L - KokkosBlas::gemv("N", one, Lij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Lij, Xj, zero, Z); } } } @@ -3165,9 +3179,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3178,9 +3195,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3200,7 +3220,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3210,36 +3230,45 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update off-diagonals (potentiall combined with solve with // diagonals) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3272,16 +3301,16 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } // end lower_tri_solve -template -void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; - + using memory_space = typename ExecutionSpace::memory_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3298,7 +3327,6 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -3365,14 +3393,16 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedRPSolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; + using team_policy_t = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); @@ -3388,11 +3418,19 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, node_count); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); else - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, team_size), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } // TP2 algorithm has issues with some offset-ordinal combo to be addressed /* @@ -3444,7 +3482,7 @@ tstf); } // end elseif timer.reset(); #endif - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; if (thandle.is_column_major()) { // U stored in CSC if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3457,9 +3495,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; league_rank++) { @@ -3500,7 +3541,7 @@ tstf); } // end elseif workoffset, workoffset + nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } else { // extract part of the solution, corresponding to the diagonal // block @@ -3517,14 +3558,14 @@ tstf); } // end elseif workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv("N", one, Ujj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = // LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "U", "N", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks if (nsrow2 > 0) { @@ -3538,7 +3579,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } } } @@ -3548,9 +3589,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3562,10 +3606,13 @@ tstf); } // end elseif diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - using policy_type = Kokkos::TeamPolicy; - Kokkos::parallel_for("parfor_usolve_tran_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); } else { // U stored in CSR // launching sparse-triangular solve functor UpperTriSupernodalFunctor; - Kokkos::parallel_for("parfor_usolve_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3634,7 +3684,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("T", -one, Uij, Z, one, Xj); + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } // "triangular-solve" to compute Xj @@ -3642,13 +3692,13 @@ tstf); } // end elseif auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { - KokkosBlas::gemv("T", one, Ujj, Xj, zero, Y); + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "T", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } if (invert_diagonal) { @@ -3657,9 +3707,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } } @@ -3680,7 +3733,7 @@ tstf); } // end elseif #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with one, or two, spmv bool transpose_spmv = @@ -3691,28 +3744,34 @@ tstf); } // end elseif if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // zero out lhs corresponding to diagonal blocks in lhs, and copy to // work SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update with off-diagonals (potentiall combined with diagonal // solves) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } else { if (!invert_offdiagonal) { // zero out lhs corresponding to diagonal blocks in lhs, and copy to @@ -3720,17 +3779,20 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); // update with off-diagonals auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); } else { std::cout << " ** invert_offdiag with U in CSR not supported **" << std::endl; @@ -3740,9 +3802,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3765,23 +3830,22 @@ tstf); } // end elseif double sptrsv_time_seconds = sptrsv_timer.seconds(); std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; - std::cout << " + Execution space : " << execution_space::name() + std::cout << " + Execution space : " << ExecutionSpace::name() << std::endl; std::cout << " + Memory space : " << memory_space::name() << std::endl; #endif } // end upper_tri_solve -template -void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs, +template +void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3802,9 +3866,9 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; + using policy_type = Kokkos::TeamPolicy; using large_cutoff_policy_type = - Kokkos::TeamPolicy; + Kokkos::TeamPolicy; /* using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = @@ -3865,14 +3929,17 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_l_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3884,7 +3951,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, if (team_size_singleblock <= 0) { team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -3907,7 +3974,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -3925,11 +3995,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? @@ -3955,16 +4029,19 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } // TODO To use cudagraph here, need to know how many non-unit chains // there are, create a graph for each and launch accordingly size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_u_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_u_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3980,7 +4057,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), // Kokkos::ParallelForTag()); team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -4003,7 +4080,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -4021,11 +4101,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index e36b9df236..6ad321c286 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -96,9 +96,9 @@ template ::value> struct SPTRSV_SOLVE { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x); + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x); static void sptrsv_solve_streams( const std::vector &execspace_v, @@ -117,9 +117,9 @@ template { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x) { + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x) { // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -127,40 +127,44 @@ struct SPTRSV_SOLVEis_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, true); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::lower_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, false); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::upper_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } @@ -188,7 +192,8 @@ struct SPTRSV_SOLVEis_lower_tri()) { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::lower_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } @@ -198,7 +203,8 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::upper_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 3ef3be8780..36ea2d9df8 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -147,9 +147,10 @@ void symbolic_chain_phase(TriSolveHandle& thandle, #endif } // end symbolic_chain_phase -template -void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; Kokkos::Timer timer; @@ -177,10 +178,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -193,11 +194,12 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // wait for deep copy write to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -227,9 +229,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, // Create the chain now if (thandle.algm_requires_symb_chain()) { + // No need to pass in space, chain phase runs on the host symbolic_chain_phase(thandle, nodes_per_level); } - thandle.set_symbolic_complete(); // Output check @@ -257,9 +259,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -279,6 +281,7 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", @@ -568,20 +571,21 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, thandle.set_workspace_size(max_lwork); // workspace offset initialized to be zero integer_view_t work_offset = thandle.get_work_offset(); - Kokkos::deep_copy(work_offset, work_offset_host); + Kokkos::deep_copy(space, work_offset, work_offset_host); // kernel types // > off-diagonal integer_view_t dkernel_type_by_level = thandle.get_kernel_type(); - Kokkos::deep_copy(dkernel_type_by_level, kernel_type_by_level); + Kokkos::deep_copy(space, dkernel_type_by_level, kernel_type_by_level); // > diagonal integer_view_t ddiag_kernel_type_by_level = thandle.get_diag_kernel_type(); - Kokkos::deep_copy(ddiag_kernel_type_by_level, diag_kernel_type_by_level); + Kokkos::deep_copy(space, ddiag_kernel_type_by_level, + diag_kernel_type_by_level); // deep copy to device (of scheduling info) - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); #ifdef TRISOLVE_SYMB_TIMERS std::cout << " + workspace time = " << timer.seconds() << std::endl; @@ -598,9 +602,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif } // end lower_tri_symbolic -template -void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; Kokkos::Timer timer; @@ -626,10 +631,10 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -642,11 +647,12 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // Wait for deep copy writes to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -708,9 +714,9 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -730,6 +736,7 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index 73389d10d0..5b9304356d 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -67,33 +67,37 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosSparse::sptrsv_symbolic -template ::value, bool eti_spec_avail = sptrsv_symbolic_eti_spec_avail< KernelHandle, RowMapType, EntriesType>::value> struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of sptrsv_symbolic // Unification layer -template -struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, +template +struct SPTRSV_SYMBOLIC { + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries) { auto sptrsv_handle = handle->get_sptrsv_handle(); auto nrows = row_map.extent(0) - 1; sptrsv_handle->new_init_handle(nrows); if (sptrsv_handle->is_lower_tri()) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } else { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } } @@ -113,6 +117,7 @@ struct SPTRSV_SYMBOLIC, \ @@ -130,6 +135,7 @@ struct SPTRSV_SYMBOLIC, \ diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 859918c58d..9ab7c9fc6a 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -40,10 +40,23 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value -template -void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries) { +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ +template +void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -94,8 +107,9 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, Entries_Internal entries_i = entries; KokkosSparse::Impl::SPTRSV_SYMBOLIC< - const_handle_type, RowMap_Internal, - Entries_Internal>::sptrsv_symbolic(&tmp_handle, rowmap_i, entries_i); + ExecutionSpace, const_handle_type, RowMap_Internal, + Entries_Internal>::sptrsv_symbolic(space, &tmp_handle, rowmap_i, + entries_i); #ifdef KK_TRISOLVE_TIMERS std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -103,10 +117,46 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ template + typename lno_nnz_view_t_> void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + lno_nnz_view_t_ entries) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries); +} + +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ +template +void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -179,11 +229,12 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, auto nrows = sh->get_nrows(); KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< - sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal>( - sh, nrows, rowmap_i, entries_i, values_i, false); + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, + false); } else { - KokkosSparse::Experimental::sptrsv_symbolic(handle, rowmap, entries); + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } #ifdef KK_TRISOLVE_TIMERS std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -191,12 +242,52 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ template -void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, - XType x) { + typename lno_nnz_view_t_, typename scalar_nnz_view_t_> +void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries, values); +} + +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param space The execution space instance this kernel will be run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, BType b, XType x) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -305,25 +396,65 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, sptrsvHandleType *sh = handle->get_sptrsv_handle(); auto nrows = sh->get_nrows(); - KokkosSparse::Impl::sptrsvcuSPARSE_solve( - sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + KokkosSparse::Impl::sptrsvcuSPARSE_solve< + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>( + space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); } else { KokkosSparse::Impl::SPTRSV_SOLVE< - typename scalar_nnz_view_t_::execution_space, const_handle_type, - RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, - XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, entries_i, + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i); } } // sptrsv_solve -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -// --------------------------------------------------------------------- -template -void sptrsv_solve(KernelHandle *handle, XType x, XType b) { +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, + XType x) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, rowmap, entries, values, b, x); +} + +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) || defined(DOXY) +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, + XType b) { auto crsmat = handle->get_sptrsv_handle()->get_crsmat(); auto values = crsmat.values; auto graph = crsmat.graph; @@ -341,31 +472,79 @@ void sptrsv_solve(KernelHandle *handle, XType x, XType b) { if (handle->is_sptrsv_lower_tri()) { // apply forward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, x, x); + sptrsv_solve(space, handle, row_map, entries, values, x, x); } else { // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, b, b); + sptrsv_solve(space, handle, row_map, entries, values, b, b); // apply backward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); } } -// --------------------------------------------------------------------- +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ template -void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, - XType b) { +void sptrsv_solve(KernelHandle *handle, XType x, XType b) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, + KernelHandle *handleU, XType x, XType b) { // Lower-triangular solve - sptrsv_solve(handleL, x, b); + sptrsv_solve(space, handleL, x, b); // copy the solution to rhs - Kokkos::deep_copy(b, x); + Kokkos::deep_copy(space, b, x); // uper-triangular solve - sptrsv_solve(handleU, x, b); + sptrsv_solve(space, handleU, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, + XType b) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handleL, handleU, x, b); } #endif From c6248930b17b6be992ebb832393abd43a5e37c22 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 17 Oct 2023 08:58:54 -0600 Subject: [PATCH 002/326] cm_test_all_sandia: allow rocm/5.2.0 for caraway vega90A builds --- scripts/cm_test_all_sandia | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index f939060320..94e3bca730 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -716,7 +716,8 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/5.6.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "rocm/5.6.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/5.6.1 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" From f2f417d3c08fb6f6551f9828ee2f5add6f3660c9 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 23 May 2023 12:46:35 -0600 Subject: [PATCH 003/326] Implementation of the two BLAS level 2 routines syr2() and her(), under the Kokkos-Kernels routine syr2(). --- blas/CMakeLists.txt | 7 + .../KokkosBlas2_syr2_eti_spec_inst.cpp.in | 25 + .../KokkosBlas2_syr2_eti_spec_avail.hpp.in | 25 + blas/impl/KokkosBlas2_syr2_impl.hpp | 369 +++ blas/impl/KokkosBlas2_syr2_spec.hpp | 180 ++ blas/impl/KokkosBlas2_syr_impl.hpp | 2 +- blas/src/KokkosBlas2_syr2.hpp | 244 ++ blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp | 205 ++ blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp | 35 + .../KokkosBlas2_syr2_tpl_spec_decl_blas.hpp | 317 +++ .../KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp | 372 ++++ ...KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp | 336 +++ blas/tpls/KokkosBlas_Host_tpl.cpp | 65 + blas/tpls/KokkosBlas_Host_tpl.hpp | 9 + blas/unit_test/CMakeLists.txt | 4 + blas/unit_test/Test_Blas.hpp | 1 + blas/unit_test/Test_Blas2_syr2.hpp | 1970 +++++++++++++++++ 17 files changed, 4165 insertions(+), 1 deletion(-) create mode 100644 blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in create mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in create mode 100644 blas/impl/KokkosBlas2_syr2_impl.hpp create mode 100644 blas/impl/KokkosBlas2_syr2_spec.hpp create mode 100644 blas/src/KokkosBlas2_syr2.hpp create mode 100644 blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp create mode 100644 blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp create mode 100644 blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp create mode 100644 blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp create mode 100644 blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp create mode 100644 blas/unit_test/Test_Blas2_syr2.hpp diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index d6ce98dae9..816d68e443 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -304,6 +304,13 @@ KOKKOSKERNELS_GENERATE_ETI(Blas2_syr syr TYPE_LISTS FLOATS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Blas2_syr2 syr2 + COMPONENTS blas + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm COMPONENTS blas HEADER_LIST ETI_HEADERS diff --git a/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in b/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..669b5fd1aa --- /dev/null +++ b/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosBlas2_syr2_spec.hpp" + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR2_ETI_INST_BLOCK@ +} //IMPL +} //Kokkos diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..9e7a01653e --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR2_ETI_AVAIL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/impl/KokkosBlas2_syr2_impl.hpp b/blas/impl/KokkosBlas2_syr2_impl.hpp new file mode 100644 index 0000000000..69284e9547 --- /dev/null +++ b/blas/impl/KokkosBlas2_syr2_impl.hpp @@ -0,0 +1,369 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_IMPL_HPP_ +#define KOKKOSBLAS2_SYR2_IMPL_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosBlas { +namespace Impl { + +// Functor for the thread parallel version of SYR2. +// This functor parallelizes over rows of the input matrix A. +template +struct ThreadParallelSYR2 { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + ThreadParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) + : alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else if ((x_(i) == Kokkos::ArithTraits::zero()) && + (y_(i) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const XComponentType x_fixed(x_(i)); + const YComponentType y_fixed(y_(i)); + const IndexType N(A_.extent(1)); + + if constexpr (tJustTranspose) { + if (x_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + } + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + } + } + } else { + if (x_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + } + } + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + } + } + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Thread parallel version of SYR2. +template +void threadParallelSyr2(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + } else if (y.extent(0) == 0) { + // no entries to update + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + } else { + Kokkos::RangePolicy rangePolicy(space, 0, + A.extent(0)); + ThreadParallelSYR2 + functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[threadParallel]", rangePolicy, + functor); + } +} + +struct TeamParallelSYR2_LayoutLeftTag {}; +struct TeamParallelSYR2_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- + +// Functor for the team parallel version of SYR2, designed for +// performance on GPUs. The kernel depends on the layout of A. +template +struct TeamParallelSYR2 { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TeamParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) + : alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + public: + // LayoutLeft version: one team per column + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutLeftTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType j(team.league_rank()); + if ((x_(j) == Kokkos::ArithTraits::zero()) && + (y_(j) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const IndexType M(A_.extent(0)); + if constexpr (tJustTranspose) { + const XComponentType x_fixed(x_(j)); + const YComponentType y_fixed(y_(j)); + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); + } + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_(i) * x_fixed); + } + }); + } + } else { + const XComponentType x_fixed( + Kokkos::ArithTraits::conj(x_(j))); + const YComponentType y_fixed( + Kokkos::ArithTraits::conj(y_(j))); + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); + } + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * + y_(i) * x_fixed); + } + }); + } + } + } + } + } + + // LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutRightTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType i(team.league_rank()); + if ((x_(i) == Kokkos::ArithTraits::zero()) && + (y_(i) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); + const YComponentType y_fixed(y_(i)); + if constexpr (tJustTranspose) { + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + }); + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + }); + } + } else { + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + } + }); + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * + y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); + } + } + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Team parallel version of SYR2. +template +void teamParallelSyr2(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + return; + } else if (y.extent(0) == 0) { + // no entries to update + return; + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + return; + } + + constexpr bool isLayoutLeft = + std::is_same::value; + using layout_tag = + typename std::conditional::type; + using TeamPolicyType = Kokkos::TeamPolicy; + TeamPolicyType teamPolicy; + if (isLayoutLeft) { + // LayoutLeft: one team per column + teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); + } else { + // LayoutRight: one team per row + teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); + } + + TeamParallelSYR2 + functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[teamParallel]", teamPolicy, functor); +} + +// --------------------------------------------------------------------------------------------- + +// generalSyr2Impl(): +// - use thread parallel code (rangePolicy) if execution space is CPU; +// - use team parallel code (teamPolicy) if execution space is GPU. +// +// The 'enable_if' makes sure unused kernels are not instantiated. + +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + threadParallelSyr2(space, alpha, x, y, A); +} + +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + teamParallelSyr2(space, alpha, x, y, A); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas2_syr2_spec.hpp b/blas/impl/KokkosBlas2_syr2_spec.hpp new file mode 100644 index 0000000000..01637ba1d4 --- /dev/null +++ b/blas/impl/KokkosBlas2_syr2_spec.hpp @@ -0,0 +1,180 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_SPEC_HPP_ +#define KOKKOSBLAS2_SYR2_SPEC_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr2_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization availability +// KokkosBlas::Impl::SYR2. This is NOT for users!!! All the declarations of full +// specializations go in this header file. We may spread out definitions (see +// _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// +// syr2 +// + +// Implementation of KokkosBlas::syr2. +template < + class ExecutionSpace, class XViewType, class YViewType, class AViewType, + bool tpl_spec_avail = syr2_tpl_spec_avail::value, + bool eti_spec_avail = syr2_eti_spec_avail::value> +struct SYR2 { + static void syr2(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? "KokkosBlas::syr2[ETI]" + : "KokkosBlas::syr2[noETI]"); + + typedef typename AViewType::size_type size_type; + const size_type numRows = A.extent(0); + const size_type numCols = A.extent(1); + + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); + bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); + + // Prefer int as the index type, but use a larsyr2 type if needed. + if ((numRows < static_cast(INT_MAX)) && + (numCols < static_cast(INT_MAX))) { + if (justTranspose) { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } else { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } + } else { + if (justTranspose) { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } else { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } + } + + Kokkos::Profiling::popRegion(); + } +#else + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; + +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization of KokkosBlas::Impl::SYR2. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _DEF macro below) across one or more .cpp +// files. +// +#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS2_SYR2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include + +#endif // KOKKOSBLAS2_SYR2_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 439ed588db..685ca75997 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -94,7 +94,7 @@ void threadParallelSyr(const ExecutionSpace& space, A.extent(0)); ThreadParallelSYR functor(alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[thredParallel]", rangePolicy, + Kokkos::parallel_for("KokkosBlas::syr[threadParallel]", rangePolicy, functor); } } diff --git a/blas/src/KokkosBlas2_syr2.hpp b/blas/src/KokkosBlas2_syr2.hpp new file mode 100644 index 0000000000..29299b8268 --- /dev/null +++ b/blas/src/KokkosBlas2_syr2.hpp @@ -0,0 +1,244 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_HPP_ +#define KOKKOSBLAS2_SYR2_HPP_ + +#include + +namespace KokkosBlas { + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// matrix A that is: +/// - symmetric, A += alpha * x * y^T + alpha * y * x^T, or +/// - Hermitian, A += alpha * x * y^H + conj(alpha) * y * x^H. +/// +/// Important note 1: this routine encapsulates the syr2() and her2() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian. +/// Moreover, this routine will always compute either the lower +/// portion or the upper portion (per user's request) of the final +/// matrix A. So, in order to obtain meaningful results, the user +/// must make sure to follow the conditions specified in the +/// "important note 1" above. +/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. +/// +/// Important note 4: Regarding parameter types: +/// - If A has components of real type (float or double), then: +/// - alpha must be of real type as well, +/// - components of x must be of real type as well, and +/// - components of y must be of real type as well. +/// - If A has components of complex type (complex or +/// complex), then: +/// - alpha must be of complex type as well (it may have zero +/// imaginary part, no problem), +/// - components of x may be of real type or complex type, and +/// - components of y may be of real type or complex type. +/// +/// \tparam ExecutionSpace The type of execution space +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param space [in] Execution space instance on which to run the kernel. +/// This may contain information about which stream to +/// run on. +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + static_assert( + Kokkos::SpaceAccessibility::assignable, + "AViewType memory space must be assignable from XViewType"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "AViewType memory space must be assignable from YViewType"); + + static_assert( + Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, + "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, + "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, + "YViewType must have rank 1."); + + // Check compatibility of dimensions at run time. + if ((A.extent(0) == A.extent(1)) && (A.extent(0) == x.extent(0)) && + (A.extent(0) == y.extent(0))) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas::syr2: Dimensions of A, x: " + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " + << x.extent(0) << ", y has size " << y.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas2::syr2(): invalid trans[0] = '" << trans[0] + << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || + (uplo[0] == 'l')) { + // Ok + } else { + std::ostringstream oss; + oss << "KokkosBlas2::syr2(): invalid uplo[0] = " << uplo[0] + << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; + throw std::runtime_error(oss.str()); + } + + if ((A.extent(0) == 0) || (A.extent(1) == 0)) { + return; + } + + using ALayout = typename AViewType::array_layout; + + // Minimize the number of Impl::SYR2 instantiations, by standardizing + // on particular View specializations for its template parameters. + typedef Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits > + XVT; + + typedef Kokkos::View::array_layout, + typename YViewType::device_type, + Kokkos::MemoryTraits > + YVT; + + typedef Kokkos::View > + AVT; + + Impl::SYR2::syr2(space, trans, uplo, alpha, x, + y, A); +} + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// matrix A that is: +/// - symmetric, A += alpha * x * y^T + alpha * y * x^T, or +/// - Hermitian, A += alpha * x * y^H + conj(alpha) * y * x^H. +/// +/// Important note 1: this routine encapsulates the syr2() and her2() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian. +/// Moreover, this routine will always compute either the lower +/// portion or the upper portion (per user's request) of the final +/// matrix A. So, in order to obtain meaningful results, the user +/// must make sure to follow the conditions specified in the +/// "important note 1" above. +/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. +/// +/// Important note 4: Regarding parameter types: +/// - If A has components of real type (float or double), then: +/// - alpha must be of real type as well, +/// - components of x must be of real type as well, and +/// - components of y must be of real type as well. +/// - If A has components of complex type (complex or +/// complex), then: +/// - alpha must be of complex type as well (it may have zero +/// imaginary part, no problem), +/// - components of x may be of real type or complex type, and +/// - components of y may be of real type or complex type. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr2(const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + syr2( + space, trans, uplo, alpha, x, y, A); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp new file mode 100644 index 0000000000..59fb154d35 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp @@ -0,0 +1,205 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr2_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side BLAS (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, + Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) + +#endif +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp new file mode 100644 index 0000000000..66ba81b685 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp @@ -0,0 +1,35 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_HPP_ + +// BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +#include +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include +#endif + +#endif diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp new file mode 100644 index 0000000000..8561675c72 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp @@ -0,0 +1,317 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_BLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_BLAS_HPP_ + +#include "KokkosBlas_Host_tpl.hpp" + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); + +#define KOKKOSBLAS2_DSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ + one, A.data(), LDA); \ + } else { \ + /* blasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ + one, A.data(), LDA); \ + } else { \ + /* blasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::zher2( \ + uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::cher2( \ + uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) +#endif + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp new file mode 100644 index 0000000000..ca98fedf0d --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp @@ -0,0 +1,372 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_CUBLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_CUBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? CUBLAS_FILL_MODE_LOWER \ + : CUBLAS_FILL_MODE_UPPER; + +#define KOKKOSBLAS2_DSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp new file mode 100644 index 0000000000..e6dfef7c6d --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -0,0 +1,336 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_ROCBLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_ROCBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? rocblas_fill_lower \ + : rocblas_fill_upper; + +#define KOKKOSBLAS2_DSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index b85f6109e8..a7be0d31ab 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -272,6 +272,35 @@ void F77_BLAS_MANGLE(zher, ZHER)(const char*, int*, const double*, const std::complex*, int*, std::complex*, int*); +/// +/// Syr2 +/// +void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, int*, const float*, + const float*, const int*, const float*, int*, + float*, int*); +void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, int*, const double*, + const double*, const int*, const double*, + int*, double*, int*); +// Although there is a cgeru, there is no csyr2u +// Although there is a zgeru, there is no zsyr2u +// Although there is a cgerc, there is no csyr2c, but there is cher2 (see below) +// Although there is a zgerc, there is no zsyr2c, but there is zher2 (see below) + +/// +/// Her2 +/// + +void F77_BLAS_MANGLE(cher2, CHER2)(const char*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); + /// /// Trsv /// @@ -499,6 +528,12 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CHER F77_BLAS_MANGLE(cher, CHER) #define F77_FUNC_ZHER F77_BLAS_MANGLE(zher, ZHER) +#define F77_FUNC_SSYR2 F77_BLAS_MANGLE(ssyr2, SSYR2) +#define F77_FUNC_DSYR2 F77_BLAS_MANGLE(dsyr2, DSYR2) + +#define F77_FUNC_CHER2 F77_BLAS_MANGLE(cher2, CHER2) +#define F77_FUNC_ZHER2 F77_BLAS_MANGLE(zher2, ZHER2) + #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) #define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv, CTRSV) @@ -611,6 +646,12 @@ void HostBlas::syr(const char uplo, int n, const float alpha, F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> +void HostBlas::syr2(const char uplo, int n, const float alpha, + const float* x, int incx, const float* y, int incy, + float* a, int lda) { + F77_FUNC_SSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const float* a, int lda, /* */ float* b, int ldb) { @@ -735,6 +776,12 @@ void HostBlas::syr(const char uplo, int n, const double alpha, F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> +void HostBlas::syr2(const char uplo, int n, const double alpha, + const double* x, int incx, const double* y, + int incy, double* a, int lda) { + F77_FUNC_DSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const double* a, int lda, /* */ double* b, int ldb) { @@ -889,6 +936,15 @@ void HostBlas >::cher( (std::complex*)a, &lda); } template <> +void HostBlas >::cher2( + const char uplo, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, (std::complex*)a, + &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, int lda, @@ -1067,6 +1123,15 @@ void HostBlas >::zher( (std::complex*)a, &lda); } template <> +void HostBlas >::zher2( + const char uplo, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, + (std::complex*)a, &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 6f6c34dc25..3b0c7f366e 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -76,6 +76,9 @@ struct HostBlas { static void syr(const char uplo, int n, const T alpha, const T *x, int incx, T *a, int lda); + static void syr2(const char uplo, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + template static void cher(const char uplo, int n, const tAlpha alpha, const T *x, int incx, T *a, int lda); @@ -84,6 +87,12 @@ struct HostBlas { static void zher(const char uplo, int n, const tAlpha alpha, const T *x, int incx, T *a, int lda); + static void cher2(const char uplo, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + + static void zher2(const char uplo, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + static void trsv(const char uplo, const char transa, const char diag, int m, const T *a, int lda, /* */ T *b, int ldb); diff --git a/blas/unit_test/CMakeLists.txt b/blas/unit_test/CMakeLists.txt index b0ccaf8e7e..49dffe0454 100644 --- a/blas/unit_test/CMakeLists.txt +++ b/blas/unit_test/CMakeLists.txt @@ -92,3 +92,7 @@ IF (KOKKOS_ENABLE_THREADS) ) ENDIF () +#KOKKOSKERNELS_ADD_EXECUTABLE( +# old_scalar_parameter +# SOURCES scalar_parameter.cpp +# ) diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index 1f4f130e8b..1abd288b0f 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -63,6 +63,7 @@ #include "Test_Blas2_gemv.hpp" #include "Test_Blas2_ger.hpp" #include "Test_Blas2_syr.hpp" +#include "Test_Blas2_syr2.hpp" // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp new file mode 100644 index 0000000000..5cc45552c8 --- /dev/null +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -0,0 +1,1970 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operations: +// --> A += alpha * x * y^T + alpha * y * x^T, or +// --> A += alpha * x * y^H + conj(alpha) * y * x^H +// 01) Type of 'x' components: float, double, complex, ... +// 02) Type of 'y' components: float, double, complex, ... +// 03) Type of 'A' components: float, double, complex, ... +// 04) Execution space: serial, threads, OpenMP, Cuda, ... +// 05) Layout of 'x' +// 06) Layout of 'y' +// 07) Layout of 'A' +// 08) Dimension of 'A' +// 09) Options 'const' or 'non const' for x view, when calling syr2() +// 10) Options 'const' or 'non const' for y view, when calling syr2() +// 11) Usage of analytical results in the tests +// 12) Options 'T' or 'H' when calling syr2() +// 13) Options 'U' or 'L' when calling syr2() +// +// Choices (01)-(05) are selected in the routines TEST_F() at the +// very bottom of the file, when calling test_syr2<...>(). +// +// Choices (06)-(13) are selected in routine test_syr2<...>(), +// when calling the method test() of class Test::Syr2Tester<...>. +// +// The class Test::Syr2Tester<...> represents the "core" of the test +// logic, where all calculations, comparisons, and success/failure +// decisions are performed. +// +// A high level explanation of method Test::SyrTester<...>::test() +// is given by the 7 steps named "Step 1 of 7" to "Step 7 of 7" +// in the code below. +// ********************************************************************** + +#include +#include +#include +#include +#include + +namespace Test { + +template +class Syr2Tester { + public: + Syr2Tester(); + + ~Syr2Tester(); + + void test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults = false, + const bool useHermitianOption = false, + const bool useUpOption = false); + + private: + typedef Kokkos::View _ViewTypeX; + typedef Kokkos::View _ViewTypeY; + typedef Kokkos::View _ViewTypeA; + + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeY::HostMirror _HostViewTypeY; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef Kokkos::View + _ViewTypeExpected; + + typedef Kokkos::ArithTraits _KAT_A; + typedef typename _KAT_A::mag_type _AuxType; + + void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, _ViewTypeX& x, + _ViewTypeY& y, _ViewTypeA& A, + bool& expectedResultIsKnown); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + T shrinkAngleToZeroTwoPiRange(const T input); + + template + void callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation); + + template + void callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _ViewTypeExpected& h_A_syr2, const std::string& situation); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; + const _AuxType _absTol; + const _AuxType _relTol; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _useUpOption; + bool _kkSyr2ShouldThrowException; + bool _kkGerShouldThrowException; +}; + +template +Syr2Tester::Syr2Tester() + : _A_is_complex(std::is_same>::value || + std::is_same>::value), + _A_is_lr(std::is_same::value), + _A_is_ll(std::is_same::value), + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< + typename Device::execution_space>()) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + , + _vanillaUsesDifferentOrderOfOps(_A_is_lr) +#else + , + _vanillaUsesDifferentOrderOfOps(false) +#endif + , + // **************************************************************** + // Tolerances for double can be tighter than tolerances for float. + // + // In the case of calculations with float, a small amount of + // discrepancies between reference results and CUDA results are + // large enough to require 'relTol' to value 5.0e-3. The same + // calculations show no discrepancies for calculations with double. + // **************************************************************** + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _M(-1), + _N(-1), + _useAnalyticalResults(false), + _useHermitianOption(false), + _useUpOption(false), + _kkSyr2ShouldThrowException(false), + _kkGerShouldThrowException(false) { +} + +template +Syr2Tester::~Syr2Tester() { + // Nothing to do +} + +template +void Syr2Tester::test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults, + const bool useHermitianOption, + const bool useUpOption) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Entering Syr2Tester::test()... - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - " + << std::endl; + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " + << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol + << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults + << ", useHermitianOption = " << useHermitianOption + << ", useUpOption = " << useUpOption << std::endl; +#endif + // ******************************************************************** + // Step 1 of 7: declare main types and variables + // ******************************************************************** + _M = N; + _N = N; + _useAnalyticalResults = useAnalyticalResults; + _useHermitianOption = useHermitianOption; + _useUpOption = useUpOption; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + _kkSyr2ShouldThrowException = false; + + _kkGerShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + _kkGerShouldThrowException = !_A_is_ll; + } +#endif + + bool test_x(false); + bool test_cx(false); + if (nonConstConstCombinations == 0) { + test_x = true; + } else if (nonConstConstCombinations == 1) { + test_cx = true; + } else { + test_x = true; + test_cx = true; + } + + view_stride_adapter<_ViewTypeX, false> x("X", _M); + view_stride_adapter<_ViewTypeY, false> y("Y", _N); + view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); + + view_stride_adapter<_ViewTypeExpected, true> h_expected( + "expected A += alpha * x * x^{t,h}", _M, _N); + bool expectedResultIsKnown = false; + + using AlphaCoeffType = typename _ViewTypeA::non_const_value_type; + ScalarA alpha(Kokkos::ArithTraits::zero()); + + // ******************************************************************** + // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A + // ******************************************************************** + this->populateVariables(alpha, x.h_view, y.h_view, A.h_view, + h_expected.d_view, x.d_view, y.d_view, A.d_view, + expectedResultIsKnown); + + // ******************************************************************** + // Step 3 of 7: populate h_vanilla + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_vanilla( + "vanilla = A + alpha * x * x^{t,h}", _M, _N); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); +#endif + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, + h_vanilla.d_view); + + // ******************************************************************** + // Step 4 of 7: use h_vanilla and h_expected as appropriate + // ******************************************************************** + if (expectedResultIsKnown) { + // ****************************************************************** + // Compare h_vanilla against h_expected + // ****************************************************************** + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, + h_expected.d_view); + } else { + // ****************************************************************** + // Copy h_vanilla to h_expected + // ****************************************************************** + Kokkos::deep_copy(h_expected.d_base, h_vanilla.d_base); + } + + // ******************************************************************** + // Step 5 of 7: test with 'non const x' + // ******************************************************************** + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A.d_base, A.d_base); + Kokkos::deep_copy(org_A.h_view, A.h_view); + + if (test_x) { + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, + A.d_view, A.h_view, + h_expected.d_view, "non const x"); + + if ((_useAnalyticalResults == false) && // Just to save run time + (_kkGerShouldThrowException == false)) { + this->callKkGerAndCompareKkSyr2AgainstIt(alpha, x.d_view, y.d_view, org_A, + A.h_view, "non const x"); + } + } + + // ******************************************************************** + // Step 6 of 7: test with const x + // ******************************************************************** + if (test_cx) { + Kokkos::deep_copy(A.d_base, org_A.d_base); + + this->callKkSyr2AndCompareAgainstExpected( + alpha, x.d_view_const, y.d_view_const, A.d_view, A.h_view, + h_expected.d_view, "const x"); + } + + // ******************************************************************** + // Step 7 of 7: tests with invalid values on the first input parameter + // ******************************************************************** + EXPECT_ANY_THROW( + KokkosBlas::syr2(".", "U", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("", "U", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for mode ''"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("T", ".", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for uplo '.'"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("T", "", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for uplo ''"; + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Leaving Syr2Tester::test() - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - " + << std::endl; +#endif +} + +template +void Syr2Tester::populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, + _ViewTypeX& x, _ViewTypeY& y, + _ViewTypeA& A, + bool& expectedResultIsKnown) { + expectedResultIsKnown = false; + + if (_useAnalyticalResults) { + this->populateAnalyticalValues(alpha, h_x, h_y, h_A, h_expected); + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + expectedResultIsKnown = true; + } else if (_N == 1) { + alpha = 3; + + h_x[0] = 2; + + h_y[0] = 4; + + h_A(0, 0) = 7; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0, 0) = 55; + expectedResultIsKnown = true; + } else if (_N == 2) { + alpha = 3; + + h_x[0] = -2; + h_x[1] = 9; + + h_y[0] = 5; + h_y[1] = -4; + + h_A(0, 0) = 17; + h_A(0, 1) = -43; + h_A(1, 0) = -43; + h_A(1, 1) = 101; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + if (_useUpOption) { + h_expected(0, 0) = -43; + h_expected(0, 1) = 116; + h_expected(1, 0) = -43; + h_expected(1, 1) = -115; + } else { + h_expected(0, 0) = -43; + h_expected(0, 1) = -43; + h_expected(1, 0) = 116; + h_expected(1, 1) = -115; + } + expectedResultIsKnown = true; + } else { + alpha = 3; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + } + + { + ScalarY randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(y, rand_pool, randStart, randEnd); + } + + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(h_A, A); + + if (_useHermitianOption && _A_is_complex) { + // **************************************************************** + // Make h_A Hermitian + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = _KAT_A::conj(h_A(j, i)); + } + } + + for (int i(0); i < _N; ++i) { + h_A(i, i) = 0.5 * (h_A(i, i) + _KAT_A::conj(h_A(i, i))); + } + } else { + // **************************************************************** + // Make h_A symmetric + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = h_A(j, i); + } + } + } + Kokkos::deep_copy(A, h_A); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + alpha.real() = 1.4; + alpha.imag() = -2.3; + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i].real() = sin(auxI); + h_x[i].imag() = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_y[i].real() = cos(auxI); + h_y[i].imag() = cos(auxI); + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = -sin(auxImJ); + } else { + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = sin(auxImJ); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = sin(auxIpJ); + } + } + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j).real() = 3.8 * sin(auxIpJ); + h_expected(i, j).imag() = -5.6 * sin(auxImJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j).real() = 5.6 * sin(auxIpJ); + h_expected(i, j).imag() = 3.8 * sin(auxIpJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + alpha = 1.1; + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i] = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_y[i] = cos(auxI); + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j) = .1 * sin(auxIpJ); + } + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j) = 1.2 * sin(auxIpJ); + } else { + h_expected(i, j) = h_A(i, j); + } + } + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_vanillaUsesDifferentOrderOfOps) { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i) + + _KAT_A::conj(alpha) * _KAT_A::conj(h_x(j)) * h_y(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } else { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_useHermitianOption) { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * _KAT_A::conj(h_y(i)) + + _KAT_A::conj(alpha) * h_y(j) * _KAT_A::conj(h_x(i)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } else { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } +} + +template +template +T Syr2Tester::shrinkAngleToZeroTwoPiRange(const T input) { + T output(input); +#if 0 + T twoPi( 2. * Kokkos::numbers::pi ); + if (input > 0.) { + output -= std::floor( input / twoPi ) * twoPi; + } + else if (input < 0.) { + output += std::floor( -input / twoPi ) * twoPi; + } +#endif + return output; +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + errorHappened = false; + if (h_expected(i, j).real() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - " + "h_vanilla(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + errorHappened = false; + if (h_expected(i, j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - " + "h_vanilla(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_vanilla(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_vanilla(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + } else { + int numErrorsReal(0); + int numErrorsImag(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j).real() != h_vanilla(i, j).real()) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " + << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << std::endl; + } +#endif + numErrorsReal++; + } + + if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " + << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << std::endl; + } +#endif + numErrorsImag++; + } + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); + errorHappened = false; + if (h_expected(i, j) == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_vanilla(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } else { + int numErrors(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j) != h_vanilla(i, j)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; + } +#endif + numErrors++; + } + } // for j + } // for i + EXPECT_EQ(numErrors, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester:: + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j).real() - h_A(i, j).real()); + errorHappened = false; + if (h_reference(i, j).real() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); + errorHappened = false; + if (h_reference(i, j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + if ((_M == 2131) && (_N == 2131)) { + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ", " << h_reference(11, 2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " + << h_A(11, 2119).imag() << ")" << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ", " << h_reference(710, 1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " + << h_A(710, 1065).imag() << ")" << std::endl; + } +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr2 result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr2 result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester:: + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j) - h_A(i, j)); + errorHappened = false; + if (h_reference(i, j) == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) + << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +template +template +void Syr2Tester:: + callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha + << std::endl; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::syr2(): " + "ViewTypeA = %s, _kkSyr2ShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkSyr2ShouldThrowException); +#endif + std::string mode = _useHermitianOption ? "H" : "T"; + std::string uplo = _useUpOption ? "U" : "L"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::syr2(mode.c_str(), uplo.c_str(), alpha, x, y, A); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "': caught exception, e.what() = " << e.what() << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "': caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened"; + + EXPECT_EQ(gotStdException, _kkSyr2ShouldThrowException) + << "Failed test, '" << situation << "': kk syr2() should" + << (_kkSyr2ShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; + + if ((gotStdException == false) && (gotUnknownException == false)) { + Kokkos::deep_copy(h_A, A); + this->compareKkSyr2AgainstReference(alpha, h_A, h_expected); + } +} + +template +template +void Syr2Tester:: + callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _ViewTypeExpected& h_A_syr2, const std::string& situation) { + view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); + Kokkos::deep_copy(A_ger.d_base, org_A.d_base); + + // ******************************************************************** + // Call ger() + // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha + << std::endl; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::ger(): " + "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkGerShouldThrowException); +#endif + std::string mode = _useHermitianOption ? "H" : "T"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::ger(mode.c_str(), alpha, x, y, A_ger.d_view); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 1: caught exception, e.what() = " << e.what() + << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 1: caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call 1"; + + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() 1 should not have thrown a std::exception"; + + // ******************************************************************** + // Call ger() again + // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::ger() again"); +#endif + try { + if (_useHermitianOption) { + KokkosBlas::ger(mode.c_str(), _KAT_A::conj(alpha), y, x, A_ger.d_view); + } else { + KokkosBlas::ger(mode.c_str(), alpha, y, x, A_ger.d_view); + } + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 2: caught exception, e.what() = " << e.what() + << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 2: caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call 2"; + + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() 2 should not have thrown a std::exception"; + + // ******************************************************************** + // Prepare h_ger_reference to be compared against h_A_syr2 + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference( + "h_ger_reference", _M, _N); + Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); + + std::string uplo = _useUpOption ? "U" : "L"; + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + // Keep h_ger_reference as already computed + } else { + h_ger_reference.d_view(i, j) = org_A.h_view(i, j); + } + } + } + if (_useHermitianOption && _A_is_complex) { + for (int i(0); i < _N; ++i) { + h_ger_reference.d_view(i, i) = + 0.5 * (h_ger_reference.d_view(i, i) + + _KAT_A::conj(h_ger_reference.d_view(i, i))); + } + } + + // ******************************************************************** + // Compare + // ******************************************************************** + this->compareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.d_view); +} + +} // namespace Test + +template +#ifdef HAVE_KOKKOSKERNELS_DEBUG +int test_syr2(const std::string& caseName) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); +#else +int test_syr2(const std::string& /*caseName*/) { +#endif + bool xBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool yBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool aBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool useAnalyticalResults = xBool && yBool && aBool; + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", + caseName.c_str()); +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", + caseName.c_str()); +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", + caseName.c_str()); +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", + caseName.c_str()); +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(1, 0); + tester.test(2, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + } + + if (true) { + Test::Syr2Tester + tester; + tester.test(1024, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); +#endif + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_float"); + test_syr2("test case syr2_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_complex_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_float"); + test_syr2, Kokkos::complex, + Kokkos::complex, TestExecSpace>( + "test case syr2_complex_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_double"); + test_syr2("test case syr2_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_double"); + test_syr2, Kokkos::complex, + Kokkos::complex, TestExecSpace>( + "test case syr2_complex_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int"); + test_syr2("test case syr2_int"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, syr2_int_float_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int_float_double"); + test_syr2("test case syr2_mixed_types"); + Kokkos::Profiling::popRegion(); +} +#endif From 896fac1c5fce4c4522852fe5cb13d4aab9cffad1 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 15 Aug 2023 14:22:15 -0600 Subject: [PATCH 004/326] Removing unnecessary code --- blas/unit_test/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/blas/unit_test/CMakeLists.txt b/blas/unit_test/CMakeLists.txt index 49dffe0454..b0ccaf8e7e 100644 --- a/blas/unit_test/CMakeLists.txt +++ b/blas/unit_test/CMakeLists.txt @@ -92,7 +92,3 @@ IF (KOKKOS_ENABLE_THREADS) ) ENDIF () -#KOKKOSKERNELS_ADD_EXECUTABLE( -# old_scalar_parameter -# SOURCES scalar_parameter.cpp -# ) From 1b2ead5ffa5c26cc149e41d0e08ea62ee81fdab2 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 17 Oct 2023 12:21:48 -0600 Subject: [PATCH 005/326] Addressing feedbacks from Luc. --- blas/src/KokkosBlas2_syr2.hpp | 16 +--- blas/unit_test/Test_Blas2_syr2.hpp | 117 ++++++++++++----------------- 2 files changed, 54 insertions(+), 79 deletions(-) diff --git a/blas/src/KokkosBlas2_syr2.hpp b/blas/src/KokkosBlas2_syr2.hpp index 29299b8268..c9a2f7b2c5 100644 --- a/blas/src/KokkosBlas2_syr2.hpp +++ b/blas/src/KokkosBlas2_syr2.hpp @@ -18,6 +18,7 @@ #define KOKKOSBLAS2_SYR2_HPP_ #include +#include namespace KokkosBlas { @@ -80,15 +81,6 @@ template ::assignable, - "AViewType memory space must be assignable from XViewType"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "AViewType memory space must be assignable from YViewType"); - static_assert( Kokkos::SpaceAccessibility::accessible, @@ -109,11 +101,11 @@ void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, + static_assert(static_cast(AViewType::rank()) == 2, "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, + static_assert(static_cast(XViewType::rank()) == 1, "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, + static_assert(static_cast(YViewType::rank()) == 1, "YViewType must have rank 1."); // Check compatibility of dimensions at run time. diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 5cc45552c8..e8cf006810 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -302,9 +302,8 @@ void Syr2Tester h_vanilla( "vanilla = A + alpha * x * x^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = %s\n", - typeid(alpha).name()); + std::cout << "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = " + << typeid(alpha).name() << std::endl; #endif this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); @@ -1526,10 +1525,10 @@ void Syr2Tester #ifdef HAVE_KOKKOSKERNELS_DEBUG int test_syr2(const std::string& caseName) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "+=======================================================================" - "===\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); + std::cout << "+=======================================================================" + << "===" << std::endl; + std::cout << "Starting " << caseName << "..." << std::endl; #else int test_syr2(const std::string& /*caseName*/) { #endif @@ -1715,11 +1715,9 @@ int test_syr2(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "+-----------------------------------------------------------------------" - "---\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", - caseName.c_str()); + std::cout << "+-----------------------------------------------------------------------" + << "---" << std::endl; + std::cout << "Starting " << caseName << " for LAYOUTLEFT ..." << std::endl; #endif if (true) { Test::Syr2Tester("test case syr2_float"); + test_syr2("test case syr2_float"); Kokkos::Profiling::popRegion(); } #endif @@ -1922,7 +1905,7 @@ TEST_F(TestCategory, syr2_float) { TEST_F(TestCategory, syr2_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_float"); test_syr2, Kokkos::complex, - Kokkos::complex, TestExecSpace>( + Kokkos::complex, TestDevice>( "test case syr2_complex_float"); Kokkos::Profiling::popRegion(); } @@ -1933,7 +1916,7 @@ TEST_F(TestCategory, syr2_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_double"); - test_syr2("test case syr2_double"); + test_syr2("test case syr2_double"); Kokkos::Profiling::popRegion(); } #endif @@ -1944,7 +1927,7 @@ TEST_F(TestCategory, syr2_double) { TEST_F(TestCategory, syr2_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_double"); test_syr2, Kokkos::complex, - Kokkos::complex, TestExecSpace>( + Kokkos::complex, TestDevice>( "test case syr2_complex_double"); Kokkos::Profiling::popRegion(); } @@ -1955,7 +1938,7 @@ TEST_F(TestCategory, syr2_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int"); - test_syr2("test case syr2_int"); + test_syr2("test case syr2_int"); Kokkos::Profiling::popRegion(); } #endif @@ -1964,7 +1947,7 @@ TEST_F(TestCategory, syr2_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr2_int_float_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int_float_double"); - test_syr2("test case syr2_mixed_types"); + test_syr2("test case syr2_mixed_types"); Kokkos::Profiling::popRegion(); } #endif From c53690978535f35e6c00fb64be5844ef70093a59 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 17 Oct 2023 12:27:37 -0600 Subject: [PATCH 006/326] Formatting --- blas/unit_test/Test_Blas2_syr2.hpp | 56 ++++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index e8cf006810..080c106b9f 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -1622,7 +1622,8 @@ void Syr2Tester #ifdef HAVE_KOKKOSKERNELS_DEBUG int test_syr2(const std::string& caseName) { - std::cout << "+=======================================================================" - << "===" << std::endl; + std::cout << "+==============================================================" + "============" + << std::endl; std::cout << "Starting " << caseName << "..." << std::endl; #else int test_syr2(const std::string& /*caseName*/) { @@ -1715,8 +1717,9 @@ int test_syr2(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; std::cout << "Starting " << caseName << " for LAYOUTLEFT ..." << std::endl; #endif if (true) { @@ -1749,8 +1752,9 @@ int test_syr2(const std::string& /*caseName*/) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished " << caseName << " for LAYOUTLEFT" << std::endl; - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; #endif #endif @@ -1758,8 +1762,9 @@ int test_syr2(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; std::cout << "Starting " << caseName << " for LAYOUTRIGHT ..." << std::endl; #endif if (true) { @@ -1792,8 +1797,9 @@ int test_syr2(const std::string& /*caseName*/) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished " << caseName << " for LAYOUTRIGHT" << std::endl; - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; #endif #endif @@ -1801,8 +1807,9 @@ int test_syr2(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; std::cout << "Starting " << caseName << " for LAYOUTSTRIDE ..." << std::endl; #endif if (true) { @@ -1836,16 +1843,18 @@ int test_syr2(const std::string& /*caseName*/) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished " << caseName << " for LAYOUTSTRIDE" << std::endl; - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; #endif #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; std::cout << "Starting " << caseName << " for MIXED LAYOUTS ..." << std::endl; #endif if (true) { @@ -1876,15 +1885,17 @@ int test_syr2(const std::string& /*caseName*/) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished " << caseName << " for MIXED LAYOUTS" << std::endl; - std::cout << "+-----------------------------------------------------------------------" - << "---" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; #endif #endif #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished " << caseName << std::endl; - std::cout << "+=======================================================================" - << "===" << std::endl; + std::cout << "+==============================================================" + "============" + << std::endl; #endif return 1; } @@ -1905,8 +1916,7 @@ TEST_F(TestCategory, syr2_float) { TEST_F(TestCategory, syr2_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_float"); test_syr2, Kokkos::complex, - Kokkos::complex, TestDevice>( - "test case syr2_complex_float"); + Kokkos::complex, TestDevice>("test case syr2_complex_float"); Kokkos::Profiling::popRegion(); } #endif From c2511ca732a2db481dfdae314fe9aa7fff39e94e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 17 Oct 2023 13:17:03 -0600 Subject: [PATCH 007/326] par_ilut: Update documentation for fill_in_limit Also change default from 0.75 to 2.0 --- sparse/src/KokkosKernels_Handle.hpp | 2 +- sparse/src/KokkosSparse_par_ilut_handle.hpp | 8 +++++++- sparse/unit_test/Test_Sparse_par_ilut.hpp | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index d500f19d48..2029618c8b 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -967,7 +967,7 @@ class KokkosKernelsHandle { const size_type max_iter = 20, const typename PAR_ILUTHandleType::float_t residual_norm_delta_stop = 1e-2, - const typename PAR_ILUTHandleType::float_t fill_in_limit = 0.75, + const typename PAR_ILUTHandleType::float_t fill_in_limit = 2.0, const bool async_update = false, const bool verbose = false) { this->destroy_par_ilut_handle(); this->is_owner_of_the_par_ilut_handle = true; diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 3ffe44ffca..fc1d6ee92a 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -78,7 +78,13 @@ class PAR_ILUTHandle { /// iteration to iteration drops below /// this, the algorithm will stop (even if /// max_iters has not been hit) - float_t fill_in_limit; /// The threshold for the ILU factorization + float_t fill_in_limit; /// The threshold for removing candidates + /// from the intermediate L and U is set such + /// that the resulting sparsity pattern has + /// at most `fill_in_limit` times the number + /// of non-zeros of the ILU(0) + /// factorization. This selection is executed + /// separately for both factors L and U. bool async_update; /// Whether compute LU factors should do asychronous /// updates. When ON, the algorithm will usually converge /// faster but it makes the algorithm non-deterministic. diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 4370ebe37e..8bc8f1d9b0 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -180,6 +180,7 @@ void run_test_par_ilut() { auto par_ilut_handle = kh.get_par_ilut_handle(); par_ilut_handle->set_async_update(false); + par_ilut_handle->set_fill_in_limit(0.75); // Allocate L and U CRS views as outputs RowMapType L_row_map("L_row_map", nrows + 1); From 6346c983add10dec851e77beeeebc6658f0a4f28 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 17 Oct 2023 13:31:16 -0600 Subject: [PATCH 008/326] Formatting --- sparse/src/KokkosSparse_par_ilut_handle.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index fc1d6ee92a..5ea4b3c436 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -78,13 +78,13 @@ class PAR_ILUTHandle { /// iteration to iteration drops below /// this, the algorithm will stop (even if /// max_iters has not been hit) - float_t fill_in_limit; /// The threshold for removing candidates - /// from the intermediate L and U is set such - /// that the resulting sparsity pattern has - /// at most `fill_in_limit` times the number - /// of non-zeros of the ILU(0) - /// factorization. This selection is executed - /// separately for both factors L and U. + float_t fill_in_limit; /// The threshold for removing candidates + /// from the intermediate L and U is set such + /// that the resulting sparsity pattern has + /// at most `fill_in_limit` times the number + /// of non-zeros of the ILU(0) + /// factorization. This selection is executed + /// separately for both factors L and U. bool async_update; /// Whether compute LU factors should do asychronous /// updates. When ON, the algorithm will usually converge /// faster but it makes the algorithm non-deterministic. From aeda2b7fbedea60c4ef3082c1a2b5d25a2b0671c Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 18 Oct 2023 10:20:57 -0600 Subject: [PATCH 009/326] Set default fill_in_limit back to 0.75 --- sparse/src/KokkosKernels_Handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 2029618c8b..d500f19d48 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -967,7 +967,7 @@ class KokkosKernelsHandle { const size_type max_iter = 20, const typename PAR_ILUTHandleType::float_t residual_norm_delta_stop = 1e-2, - const typename PAR_ILUTHandleType::float_t fill_in_limit = 2.0, + const typename PAR_ILUTHandleType::float_t fill_in_limit = 0.75, const bool async_update = false, const bool verbose = false) { this->destroy_par_ilut_handle(); this->is_owner_of_the_par_ilut_handle = true; From 24ab74b63d6c3fa0aa361b9536b4957345295429 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 18 Oct 2023 10:23:06 -0600 Subject: [PATCH 010/326] Remove setting to default --- sparse/unit_test/Test_Sparse_par_ilut.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 8bc8f1d9b0..4370ebe37e 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -180,7 +180,6 @@ void run_test_par_ilut() { auto par_ilut_handle = kh.get_par_ilut_handle(); par_ilut_handle->set_async_update(false); - par_ilut_handle->set_fill_in_limit(0.75); // Allocate L and U CRS views as outputs RowMapType L_row_map("L_row_map", nrows + 1); From 72a8d71fd2c8e6dd5014f6f039d0481f6250bf8e Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 18 Oct 2023 14:43:27 -0600 Subject: [PATCH 011/326] bhalf_t fix for isnan function --- common/src/Kokkos_ArithTraits.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 17296185e7..75c0951e10 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -1336,9 +1336,13 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } +#else +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ARITHTRAITS_HALF_FP(KOKKOS_FUNCTION) #else KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) #endif +#endif }; #endif // #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT From 6ec77dec1b63537732456c3312196ce3ed92aac0 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 17 Oct 2023 10:17:24 -0600 Subject: [PATCH 012/326] HIP: since Kokkos has moved it out of experimental we should clean up Just reflecting the move of HIP and HIPSpace out of experimental so that we do not get deprecation warning and even failures down the road. This was really done in Kokkos Core 4.0.0 so it is time to catch up... --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 2 +- batched/dense/src/KokkosBatched_Vector.hpp | 16 ++-- blas/impl/KokkosBlas3_gemm_impl.hpp | 2 +- blas/impl/KokkosBlas3_gemm_spec.hpp | 2 +- blas/src/KokkosBlas2_gemv.hpp | 2 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 12 +-- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 80 +++++++++---------- blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 16 ++-- cmake/KokkosKernels_config.h.in | 4 +- cmake/kokkoskernels_eti_devices.cmake | 8 +- common/src/KokkosKernels_ExecSpaceUtils.hpp | 18 ++--- common/src/KokkosKernels_default_types.hpp | 2 +- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 4 +- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 4 +- .../blas1/KokkosBlas_dot_mv_perf_test.cpp | 2 +- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 2 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 4 +- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 2 +- ...s3_gemm_standalone_perf_test_benchmark.cpp | 2 +- perf_test/graph/KokkosGraph_color.cpp | 4 +- perf_test/graph/KokkosGraph_color_d2.cpp | 4 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 2 +- perf_test/sparse/KokkosSparse_pcg.cpp | 2 +- sparse/impl/KokkosSparse_spmv_impl.hpp | 4 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 4 +- sparse/src/KokkosSparse_CrsMatrix.hpp | 2 +- sparse/src/KokkosSparse_spgemm_handle.hpp | 2 +- sparse/src/KokkosSparse_spmv.hpp | 2 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 12 +-- test_common/Test_HIP.hpp | 2 +- 30 files changed, 112 insertions(+), 112 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index f413ba612c..0c4985e136 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -50,7 +50,7 @@ constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { #if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) template <> constexpr KOKKOS_INLINE_FUNCTION int -kk_gemm_dbl_buf_tile_k() { +kk_gemm_dbl_buf_tile_k() { return 16; } #endif diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index 23fd62655a..0183cc66c1 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -120,21 +120,21 @@ struct DefaultVectorLength, Kokkos::CudaUVMSpace> { #if defined(KOKKOS_ENABLE_HIP) template <> -struct DefaultVectorLength { +struct DefaultVectorLength { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength { +struct DefaultVectorLength { enum : int { value = 16 }; }; template <> struct DefaultVectorLength, - Kokkos::Experimental::HIPSpace> { + Kokkos::HIPSpace> { enum : int { value = 16 }; }; template <> struct DefaultVectorLength, - Kokkos::Experimental::HIPSpace> { + Kokkos::HIPSpace> { enum : int { value = 16 }; }; #endif @@ -189,21 +189,21 @@ struct DefaultInternalVectorLength, #if defined(KOKKOS_ENABLE_HIP) template <> -struct DefaultInternalVectorLength { +struct DefaultInternalVectorLength { enum : int { value = 8 }; }; template <> -struct DefaultInternalVectorLength { +struct DefaultInternalVectorLength { enum : int { value = 4 }; }; template <> struct DefaultInternalVectorLength, - Kokkos::Experimental::HIPSpace> { + Kokkos::HIPSpace> { enum : int { value = 4 }; }; template <> struct DefaultInternalVectorLength, - Kokkos::Experimental::HIPSpace> { + Kokkos::HIPSpace> { enum : int { value = 2 }; }; #endif diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index 4f3e62f343..ba8aff41ad 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -49,7 +49,7 @@ struct impl_gemm_choose_copy_layout { #ifdef KOKKOS_ENABLE_HIP template -struct impl_gemm_choose_copy_layout { using type = LayoutA; }; diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index c340a41fc1..367a8dad3f 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -192,7 +192,7 @@ struct GEMM { team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 40ac9db249..90a1aeab8b 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -151,7 +151,7 @@ void gemv(const ExecutionSpace& space, const char trans[], std::is_same::value && std::is_same::value); + Kokkos::HIPSpace>::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS useFallback = useFallback || (tolower(*trans) == 'c' && diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 1496eee020..728f93223d 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -132,16 +132,16 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, struct gemv_tpl_spec_avail< \ ExecSpace, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 894ce884ee..6a5aa14bd2 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -553,26 +553,26 @@ namespace Impl { struct GEMV< \ ExecSpace, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef double SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ \ @@ -600,26 +600,26 @@ namespace Impl { struct GEMV< \ ExecSpace, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef float SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ \ @@ -647,26 +647,26 @@ namespace Impl { struct GEMV< \ ExecSpace, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ \ @@ -699,26 +699,26 @@ namespace Impl { struct GEMV< \ ExecSpace, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ \ @@ -746,40 +746,40 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // namespace Impl diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 69146baf4f..5eff0e50e7 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -164,26 +164,26 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, }; KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 22b7a196fc..b8b66fffbb 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -49,14 +49,14 @@ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_CUDA #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE -/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +/* Whether to build kernels for execution space Kokkos::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE /* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE -/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +/* Whether to build kernels for execution space Kokkos::Experimental::OpenMPTarget */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMPTARGET #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_OPENMPTARGETSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index d223e00171..8c6cb540ae 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -12,7 +12,7 @@ SET(EXEC_SPACES EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) -SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) +SET(EXECSPACE_HIP_CPP_TYPE Kokkos::HIP) SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMPTARGET_CPP_TYPE Kokkos::Experimental::OpenMPTarget) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) @@ -31,7 +31,7 @@ SET(MEM_SPACES ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) -SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) +SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::HIPSpace) SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) SET(MEMSPACE_OPENMPTARGETSPACE_CPP_TYPE Kokkos::Experimental::OpenMPTargetSpace) @@ -77,13 +77,13 @@ IF(KOKKOS_ENABLE_HIP) INST_EXECSPACE_HIP ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the execution space Kokkos::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HIPSPACE ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the memory space Kokkos::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." ) IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index eb629f9e0c..6de3cb53c4 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -66,7 +66,7 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) { + if (std::is_same::value) { exec_space = Exec_HIP; } #endif @@ -99,7 +99,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { #ifdef KOKKOS_ENABLE_HIP template <> constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_gpu_exec_space() { +kk_is_gpu_exec_space() { return true; } #endif @@ -208,16 +208,16 @@ inline void kk_get_free_total_memory( #ifdef KOKKOS_ENABLE_HIP template <> -inline void kk_get_free_total_memory( +inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem, int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); free_mem /= n_streams; total_mem /= n_streams; } template <> -inline void kk_get_free_total_memory( +inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, total_mem, + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif @@ -405,13 +405,13 @@ struct SpaceInstance { #ifdef KOKKOS_ENABLE_HIP template <> -struct SpaceInstance { - static Kokkos::Experimental::HIP create() { +struct SpaceInstance { + static Kokkos::HIP create() { hipStream_t stream; KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); - return Kokkos::Experimental::HIP(stream); + return Kokkos::HIP(stream); } - static void destroy(Kokkos::Experimental::HIP& space) { + static void destroy(Kokkos::HIP& space) { hipStream_t stream = space.hip_stream(); KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream)); } diff --git a/common/src/KokkosKernels_default_types.hpp b/common/src/KokkosKernels_default_types.hpp index 30ca52e300..1da965a082 100644 --- a/common/src/KokkosKernels_default_types.hpp +++ b/common/src/KokkosKernels_default_types.hpp @@ -62,7 +62,7 @@ using default_scalar = double; #if defined(KOKKOS_ENABLE_CUDA) using default_device = Kokkos::Cuda; #elif defined(KOKKOS_ENABLE_HIP) -using default_device = Kokkos::Experimental::HIP; +using default_device = Kokkos::HIP; #elif defined(KOKKOS_ENABLE_OPENMPTARGET) using default_device = Kokkos::Experimental::OpenMPTarget; #elif defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index 314439b6c0..71bf2c042f 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -117,7 +117,7 @@ struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct FactorizeModeAndAlgo +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #endif @@ -156,7 +156,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 3f15ca0b2d..80238363b0 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -127,7 +127,7 @@ struct InverseDiagonalsModeAndAlgo #if defined(KOKKOS_ENABLE_HIP) template <> -struct InverseDiagonalsModeAndAlgo +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif @@ -166,7 +166,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 499a701c13..7bc25a5704 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -210,7 +210,7 @@ int main(int argc, char** argv) { } if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) - run(params.m, params.n, params.repeat); + run(params.m, params.n, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index 89680d20f9..54ae35ac7a 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -207,7 +207,7 @@ int main(int argc, char** argv) { if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) - run(params.m, params.repeat); + run(params.m, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index 564db4af2e..9bd32ecea1 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -211,10 +211,10 @@ int main(int argc, char** argv) { if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) if (params.layoutLeft) - run(params.m, params.n, + run(params.m, params.n, params.repeat); else - run(params.m, params.n, + run(params.m, params.n, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 962328eb95..8f25026ba9 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -188,7 +188,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - run(params); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index 32d91e6b33..d617ffcdf3 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -180,7 +180,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - run(params); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index 57f241d7b1..fc7fb3a19f 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -632,8 +632,8 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { KokkosKernels::Experiment::run_multi_mem_experiment< - size_type, idx, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + size_type, idx, Kokkos::HIP, + Kokkos::HIPSpace, Kokkos::HIPSpace>(params); } #endif diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index f05040c083..840b29415a 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -708,8 +708,8 @@ int main(int argc, char* argv[]) { if (params.use_hip) { if (!use_multi_mem) { KokkosKernels::Experiment::experiment_driver< - kk_size_type, kk_lno_t, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace>(params); + kk_size_type, kk_lno_t, Kokkos::HIP, + Kokkos::HIPSpace>(params); } } #endif diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index a97cbb4d81..8f7d6a1983 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -316,7 +316,7 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { - run_mis2(params); + run_mis2(params); run = true; } #endif diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 475bfe5f85..ecc155fd34 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -367,7 +367,7 @@ int main(int argc, char **argv) { #endif #if defined(KOKKOS_ENABLE_HIP) if (cmdline[CMD_USE_HIP]) - run_pcg(cmdline, mtx_file); + run_pcg(cmdline, mtx_file); #endif } Kokkos::finalize(); diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index c3df57c65f..c1b4bb0193 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -205,7 +205,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) + if (std::is_same::value) max_vector_length = 64; #endif @@ -594,7 +594,7 @@ static void spmv_beta_transpose(const execution_space& exec, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) + if (std::is_same::value) max_vector_length = 64; #endif while ((vector_length * 2 * 3 <= NNZPerRow) && diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index b14c9be072..ee7e83b554 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2734,8 +2734,8 @@ struct ReturnRangePolicyType { #endif #ifdef KOKKOS_ENABLE_HIP template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; static inline PolicyType get_policy(int nt, int ts) { return PolicyType(nt, ts); diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index be3ac80343..7070172a1f 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -63,7 +63,7 @@ inline int RowsPerThread(const int /*NNZPerRow*/) { #endif #ifdef KOKKOS_ENABLE_HIP template <> -inline int RowsPerThread(const int /*NNZPerRow*/) { +inline int RowsPerThread(const int /*NNZPerRow*/) { return 1; } #endif diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 1106d300c8..a95c828c96 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -661,7 +661,7 @@ class SPGEMMHandle { #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) { + if (std::is_same::value) { this->algorithm_type = SPGEMM_KK; #ifdef VERBOSE std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 0658adbccf..bd038813d1 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -203,7 +203,7 @@ void spmv(const ExecutionSpace& space, #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE if (std::is_same::value) { + Kokkos::HIPSpace>::value) { useFallback = useFallback || (mode[0] != NoTranspose[0]); } #endif diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 1df4a7e5c9..66bc014307 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -188,18 +188,18 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, struct spmv_tpl_spec_avail< \ Kokkos::HIP, \ KokkosSparse::CrsMatrix, \ + Kokkos::Device, \ Kokkos::MemoryTraits, \ const rocsparse_int>, \ Kokkos::View< \ const SCALAR*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>> { \ enum : bool { value = true }; \ }; diff --git a/test_common/Test_HIP.hpp b/test_common/Test_HIP.hpp index 6d619d1378..c9e02698c5 100644 --- a/test_common/Test_HIP.hpp +++ b/test_common/Test_HIP.hpp @@ -32,6 +32,6 @@ class hip : public ::testing::Test { }; #define TestCategory hip -#define TestDevice Kokkos::Experimental::HIP +#define TestDevice Kokkos::HIP #endif // TEST_HIP_HPP From 1537c4fb9f8d5681e25ff4f7fdaa6a0481ce9735 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 17 Oct 2023 10:27:37 -0600 Subject: [PATCH 013/326] Applying clang-format --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 3 +- batched/dense/src/KokkosBatched_Vector.hpp | 12 +- blas/impl/KokkosBlas3_gemm_impl.hpp | 3 +- blas/src/KokkosBlas2_gemv.hpp | 10 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 31 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 430 +++++++++--------- blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 12 +- common/src/KokkosKernels_ExecSpaceUtils.hpp | 15 +- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 6 +- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 3 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 6 +- perf_test/graph/KokkosGraph_color.cpp | 4 +- perf_test/graph/KokkosGraph_color_d2.cpp | 3 +- perf_test/sparse/KokkosSparse_pcg.cpp | 3 +- sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 35 +- 16 files changed, 268 insertions(+), 314 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 0c4985e136..f70fa6b963 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -49,8 +49,7 @@ constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { // buffering algorithm by a factor of 2. #if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) template <> -constexpr KOKKOS_INLINE_FUNCTION int -kk_gemm_dbl_buf_tile_k() { +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { return 16; } #endif diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index 0183cc66c1..71d159cb03 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -128,13 +128,11 @@ struct DefaultVectorLength { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength, - Kokkos::HIPSpace> { +struct DefaultVectorLength, Kokkos::HIPSpace> { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength, - Kokkos::HIPSpace> { +struct DefaultVectorLength, Kokkos::HIPSpace> { enum : int { value = 16 }; }; #endif @@ -197,13 +195,11 @@ struct DefaultInternalVectorLength { enum : int { value = 4 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::HIPSpace> { +struct DefaultInternalVectorLength, Kokkos::HIPSpace> { enum : int { value = 4 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::HIPSpace> { +struct DefaultInternalVectorLength, Kokkos::HIPSpace> { enum : int { value = 2 }; }; #endif diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index ba8aff41ad..1a0ab46bb3 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -49,8 +49,7 @@ struct impl_gemm_choose_copy_layout { #ifdef KOKKOS_ENABLE_HIP template -struct impl_gemm_choose_copy_layout { +struct impl_gemm_choose_copy_layout { using type = LayoutA; }; #endif diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 90a1aeab8b..614b48d47a 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -147,11 +147,11 @@ void gemv(const ExecutionSpace& space, const char trans[], #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS useFallback = - useFallback || (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback || + (tolower(*trans) == 'c' && + std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS useFallback = useFallback || (tolower(*trans) == 'c' && diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 728f93223d..70b5560f6e 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -127,23 +127,20 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 6a5aa14bd2..304dd349bf 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -548,239 +548,219 @@ namespace Impl { transa = rocblas_operation_conjugate_transpose; \ } -#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ + X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ + X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ + s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ + s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) - -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) - -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) - -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 5eff0e50e7..8e96898b10 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -168,22 +168,18 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::HIPSpace) + Kokkos::LayoutRight, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::HIPSpace) + Kokkos::LayoutRight, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index 6de3cb53c4..2ec09f4069 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -98,8 +98,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { #ifdef KOKKOS_ENABLE_HIP template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_gpu_exec_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { return true; } #endif @@ -208,17 +207,17 @@ inline void kk_get_free_total_memory( #ifdef KOKKOS_ENABLE_HIP template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem, + int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); free_mem /= n_streams; total_mem /= n_streams; } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, total_mem, - 1); +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index 71bf2c042f..f3eb0dd8ac 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -117,8 +117,7 @@ struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct FactorizeModeAndAlgo - : FactorizeModeAndAlgoDeviceImpl {}; +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #endif template @@ -156,8 +155,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo - : SolveModeAndAlgoDeviceImpl {}; +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif template diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 80238363b0..67a141578e 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -166,8 +166,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo - : SolveModeAndAlgoDeviceImpl {}; +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index 9bd32ecea1..5dfecd9015 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -211,11 +211,9 @@ int main(int argc, char** argv) { if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) if (params.layoutLeft) - run(params.m, params.n, - params.repeat); + run(params.m, params.n, params.repeat); else - run(params.m, params.n, - params.repeat); + run(params.m, params.n, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index fc7fb3a19f..134611739a 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -632,8 +632,8 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { KokkosKernels::Experiment::run_multi_mem_experiment< - size_type, idx, Kokkos::HIP, - Kokkos::HIPSpace, Kokkos::HIPSpace>(params); + size_type, idx, Kokkos::HIP, Kokkos::HIPSpace, Kokkos::HIPSpace>( + params); } #endif diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 840b29415a..e4331dd542 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -708,8 +708,7 @@ int main(int argc, char* argv[]) { if (params.use_hip) { if (!use_multi_mem) { KokkosKernels::Experiment::experiment_driver< - kk_size_type, kk_lno_t, Kokkos::HIP, - Kokkos::HIPSpace>(params); + kk_size_type, kk_lno_t, Kokkos::HIP, Kokkos::HIPSpace>(params); } } #endif diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index ecc155fd34..9825f7c90d 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -366,8 +366,7 @@ int main(int argc, char **argv) { if (cmdline[CMD_USE_CUDA]) run_pcg(cmdline, mtx_file); #endif #if defined(KOKKOS_ENABLE_HIP) - if (cmdline[CMD_USE_HIP]) - run_pcg(cmdline, mtx_file); + if (cmdline[CMD_USE_HIP]) run_pcg(cmdline, mtx_file); #endif } Kokkos::finalize(); diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index c1b4bb0193..060a9d66c7 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -205,8 +205,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) - max_vector_length = 64; + if (std::is_same::value) max_vector_length = 64; #endif if (vector_length < 1) { @@ -594,8 +593,7 @@ static void spmv_beta_transpose(const execution_space& exec, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) - max_vector_length = 64; + if (std::is_same::value) max_vector_length = 64; #endif while ((vector_length * 2 * 3 <= NNZPerRow) && (vector_length < max_vector_length)) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 66bc014307..01a0ce1373 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -183,25 +183,22 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ - template <> \ - struct spmv_tpl_spec_avail< \ - Kokkos::HIP, \ - KokkosSparse::CrsMatrix, \ - Kokkos::MemoryTraits, \ - const rocsparse_int>, \ - Kokkos::View< \ - const SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::HIP, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const rocsparse_int>, \ + Kokkos::View< \ + const SCALAR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(double, Kokkos::LayoutLeft) From a3dd32b0203f0575e08a16011e50d4d0d51b7791 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 20 Oct 2023 10:57:14 -0600 Subject: [PATCH 014/326] Sparse: fix cusparse spgemm hang properly The issue is fixed by disabling the TPL in spec_avail when a problematic version of CUDA/cuSPARSE is being used. --- .../KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp | 2 ++ .../KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp | 2 ++ sparse/unit_test/Test_Sparse_bspgemm.hpp | 9 --------- sparse/unit_test/Test_Sparse_spgemm.hpp | 10 ---------- 4 files changed, 4 insertions(+), 19 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index e144b53162..1d5d66f416 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -82,10 +82,12 @@ struct spgemm_numeric_tpl_spec_avail { SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \ SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace) +#if (CUDA_VERSION >= 11000) && (CUDA_VERSION < 11040) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(float) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(double) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex) +#endif #endif diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index b8c545ffe2..1b3ef2d2cb 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -67,11 +67,13 @@ struct spgemm_symbolic_tpl_spec_avail { SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \ SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace) +#if (CUDA_VERSION >= 11000) && (CUDA_VERSION < 11040) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(float) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(double) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex) #endif +#endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #define SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(SCALAR) \ diff --git a/sparse/unit_test/Test_Sparse_bspgemm.hpp b/sparse/unit_test/Test_Sparse_bspgemm.hpp index d3c3a6134f..58a2a18b8a 100644 --- a/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -159,15 +159,6 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, return; } #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL -#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 11600) - { - std::cerr - << "TEST SKIPPED: See " - "https://github.com/kokkos/kokkos-kernels/issues/1965 for details." - << std::endl; - return; - } -#endif using namespace Test; // device::execution_space::initialize(); // device::execution_space::print_configuration(std::cout); diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index 7e655d4c0c..bd1e68c370 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -486,16 +486,6 @@ void test_issue402() { template void test_issue1738() { -#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUDA_VERSION >= 11000) && \ - (CUDA_VERSION < 11040) - { - std::cerr - << "TEST SKIPPED: See " - "https://github.com/kokkos/kokkos-kernels/issues/1777 for details." - << std::endl; - return; - } -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL // Make sure that std::invalid_argument is thrown if you: // - call numeric where an input matrix's entries have changed. // - try to reuse an spgemm handle by calling symbolic with new input From 668034755812220897aae0f9b940f6ff981edcab Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 20 Oct 2023 14:31:38 -0600 Subject: [PATCH 015/326] Sparse: fix logic for bad cursparse spgemm version. Just inverted the logic statement to enable the TPL when it is known to work correctly. --- sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp | 2 +- sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index 1d5d66f416..517e104988 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -82,7 +82,7 @@ struct spgemm_numeric_tpl_spec_avail { SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \ SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace) -#if (CUDA_VERSION >= 11000) && (CUDA_VERSION < 11040) +#if (CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(float) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(double) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex) diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 1b3ef2d2cb..41e8802214 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -67,7 +67,7 @@ struct spgemm_symbolic_tpl_spec_avail { SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \ SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace) -#if (CUDA_VERSION >= 11000) && (CUDA_VERSION < 11040) +#if (CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(float) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(double) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex) From 4c481e1d5d8f3e63b098842c20ce60c5d4f69a29 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 4 Jun 2023 21:09:50 -0600 Subject: [PATCH 016/326] Improvements on the unification attempt logic for axpby(), including new tests --- blas/impl/KokkosBlas1_axpby_impl.hpp | 480 ++-- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 1346 ++++++---- blas/impl/KokkosBlas1_axpby_spec.hpp | 216 +- ...Blas1_axpby_unification_attempt_traits.hpp | 1009 ++++++++ blas/src/KokkosBlas1_axpby.hpp | 337 ++- blas/unit_test/Test_Blas.hpp | 1 + blas/unit_test/Test_Blas1_axpby.hpp | 2 - .../Test_Blas1_axpby_unification.hpp | 2204 +++++++++++++++++ common/src/KokkosKernels_helpers.hpp | 10 +- 9 files changed, 4664 insertions(+), 941 deletions(-) create mode 100644 blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp create mode 100644 blas/unit_test/Test_Blas1_axpby_unification.hpp diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index 4e468b0e56..8b70cece42 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -21,12 +21,24 @@ #include "Kokkos_InnerProductSpaceTraits.hpp" #ifndef KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY -#define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY 2 +#define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY 3 #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY namespace KokkosBlas { namespace Impl { +template +constexpr typename std::enable_if, int>::type +axpbyVarExtent(T& v) { + return v.extent(0); +} + +template +constexpr typename std::enable_if, int>::type +axpbyVarExtent(T&) { + return 0; +} + // // axpby // @@ -61,9 +73,9 @@ struct Axpby_Functor { AV m_a; BV m_b; - Axpby_Functor(const XV& x, const YV& y, const AV& a, const BV& b, + Axpby_Functor(const XV& x, const YV& y, const AV& av, const BV& bv, const SizeType startingColumn) - : m_x(x), m_y(y), m_a(a), m_b(b) { + : m_x(x), m_y(y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Functor: X is not a Kokkos::View."); @@ -81,12 +93,15 @@ struct Axpby_Functor { static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby_Functor: " "XV and YV must have rank 1."); - if (startingColumn != 0) { - m_a = Kokkos::subview( - a, std::make_pair(startingColumn, SizeType(a.extent(0)))); - m_b = Kokkos::subview( - b, std::make_pair(startingColumn, SizeType(b.extent(0)))); + if (axpbyVarExtent(m_a) > 1) { + m_a = Kokkos::subview( + av, std::make_pair(startingColumn, SizeType(av.extent(0)))); + } + if (axpbyVarExtent(m_b) > 1) { + m_b = Kokkos::subview( + bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + } } } @@ -98,68 +113,77 @@ struct Axpby_Functor { #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY <= 2 - if (scalar_x == 0 && scalar_y == 0) { - m_y(i) = ATS::zero(); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = m_b(0) * m_y(i); - } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = m_a(0) * m_x(i); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); + if (scalar_x == 0) { + if (scalar_y == 0) { + m_y(i) = ATS::zero(); + } else { // (scalar_y == 2) + m_y(i) = m_b(0) * m_y(i); + } + } else { // (scalar_x == 2) + if (scalar_y == 0) { + m_y(i) = m_a(0) * m_x(i); + } else { // (scalar_y == 2) + m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); + } } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (scalar_x == 0 && scalar_y == 0) { - m_y(i) = ATS::zero(); - } - if (scalar_x == 0 && scalar_y == -1) { - m_y(i) = -m_y(i); - } - if (scalar_x == 0 && scalar_y == 1) { - return; // m_y(i) = m_y(i); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = m_b(0) * m_y(i); - } - if (scalar_x == -1 && scalar_y == 0) { - m_y(i) = -m_x(i); - } - if (scalar_x == -1 && scalar_y == -1) { - m_y(i) = -m_x(i) - m_y(i); - } - if (scalar_x == -1 && scalar_y == 1) { - m_y(i) = -m_x(i) + m_y(i); - } - if (scalar_x == -1 && scalar_y == 2) { - m_y(i) = -m_x(i) + m_b(0) * m_y(i); - } - if (scalar_x == 1 && scalar_y == 0) { - m_y(i) = m_x(i); - } - if (scalar_x == 1 && scalar_y == -1) { - m_y(i) = m_x(i) - m_y(i); - } - if (scalar_x == 1 && scalar_y == 1) { - m_y(i) = m_x(i) + m_y(i); - } - if (scalar_x == 1 && scalar_y == 2) { - m_y(i) = m_x(i) + m_b(0) * m_y(i); - } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = m_a(0) * m_x(i); - } - if (scalar_x == 2 && scalar_y == -1) { - m_y(i) = m_a(0) * m_x(i) - m_y(i); - } - if (scalar_x == 2 && scalar_y == 1) { - m_y(i) = m_a(0) * m_x(i) + m_y(i); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + m_y(i) = ATS::zero(); + } else if (scalar_y == -1) { + m_y(i) = -m_y(i); + } else if (scalar_y == 1) { + // Nothing to do: m_y(i) = m_y(i); + } else { // (scalar_y == 2) + m_y(i) = m_b(0) * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + m_y(i) = -m_x(i); + } else if (scalar_y == -1) { + m_y(i) = -m_x(i) - m_y(i); + } else if (scalar_y == 1) { + m_y(i) = -m_x(i) + m_y(i); + } else { // (scalar_y == 2) + m_y(i) = -m_x(i) + m_b(0) * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + m_y(i) = m_x(i); + } else if (scalar_y == -1) { + m_y(i) = m_x(i) - m_y(i); + } else if (scalar_y == 1) { + m_y(i) = m_x(i) + m_y(i); + } else { // (scalar_y == 2) + m_y(i) = m_x(i) + m_b(0) * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { + m_y(i) = m_a(0) * m_x(i); + } else if (scalar_y == -1) { + m_y(i) = m_a(0) * m_x(i) - m_y(i); + } else if (scalar_y == 1) { + m_y(i) = m_a(0) * m_x(i) + m_y(i); + } else { // (scalar_y == 2) + m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); + } } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY @@ -227,69 +251,77 @@ struct Axpby_Functor(ATS::zero()); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = static_cast(m_b * m_y(i)); + if (scalar_x == 0) { + if (scalar_y == 0) { + m_y(i) = static_cast(ATS::zero()); + } else { // (scalar_y == 2) + m_y(i) = static_cast(m_b * m_y(i)); + } + } else { // (scalar_x == 2) + if (scalar_y == 0) { + m_y(i) = static_cast(m_a * m_x(i)); + } else { // (scalar_y == 2) + m_y(i) = static_cast(m_a * m_x(i) + + m_b * m_y(i)); + } } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = static_cast(m_a * m_x(i)); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = static_cast(m_a * m_x(i) + - m_b * m_y(i)); - } - #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (scalar_x == 0 && scalar_y == 0) { - m_y(i) = ATS::zero(); - } - if (scalar_x == 0 && scalar_y == -1) { - m_y(i) = -m_y(i); - } - if (scalar_x == 0 && scalar_y == 1) { - return; // m_y(i) = m_y(i); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = m_b * m_y(i); - } - if (scalar_x == -1 && scalar_y == 0) { - m_y(i) = -m_x(i); - } - if (scalar_x == -1 && scalar_y == -1) { - m_y(i) = -m_x(i) - m_y(i); - } - if (scalar_x == -1 && scalar_y == 1) { - m_y(i) = -m_x(i) + m_y(i); - } - if (scalar_x == -1 && scalar_y == 2) { - m_y(i) = -m_x(i) + m_b * m_y(i); - } - if (scalar_x == 1 && scalar_y == 0) { - m_y(i) = m_x(i); - } - if (scalar_x == 1 && scalar_y == -1) { - m_y(i) = m_x(i) - m_y(i); - } - if (scalar_x == 1 && scalar_y == 1) { - m_y(i) = m_x(i) + m_y(i); - } - if (scalar_x == 1 && scalar_y == 2) { - m_y(i) = m_x(i) + m_b * m_y(i); - } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = m_a * m_x(i); - } - if (scalar_x == 2 && scalar_y == -1) { - m_y(i) = m_a * m_x(i) - m_y(i); - } - if (scalar_x == 2 && scalar_y == 1) { - m_y(i) = m_a * m_x(i) + m_y(i); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = m_a * m_x(i) + m_b * m_y(i); + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + m_y(i) = ATS::zero(); + } else if (scalar_y == -1) { + m_y(i) = -m_y(i); + } else if (scalar_y == 1) { + // Nothing to do: m_y(i) = m_y(i); + } else { // (scalar_y == 2) + m_y(i) = m_b * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + m_y(i) = -m_x(i); + } else if (scalar_y == -1) { + m_y(i) = -m_x(i) - m_y(i); + } else if (scalar_y == 1) { + m_y(i) = -m_x(i) + m_y(i); + } else { // (scalar_y == 2) + m_y(i) = -m_x(i) + m_b * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + m_y(i) = m_x(i); + } else if (scalar_y == -1) { + m_y(i) = m_x(i) - m_y(i); + } else if (scalar_y == 1) { + m_y(i) = m_x(i) + m_y(i); + } else { // (scalar_y == 2) + m_y(i) = m_x(i) + m_b * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { + m_y(i) = m_a * m_x(i); + } else if (scalar_y == -1) { + m_y(i) = m_a * m_x(i) - m_y(i); + } else if (scalar_y == 1) { + m_y(i) = m_a * m_x(i) + m_y(i); + } else { // (scalar_y == 2) + m_y(i) = m_a * m_x(i) + m_b * m_y(i); + } } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY @@ -297,8 +329,9 @@ struct Axpby_Functor void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, const BV& bv, const YV& y, const SizeType startingColumn, - int a = 2, int b = 2) { + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: X is not a Kokkos::View."); @@ -328,115 +361,104 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if (a == 0 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S0", policy, op); - return; - } - + // **************************************************************** + // Possibilities with 'scalar_x == 0' + // **************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S0", policy, op); + } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 0 && b == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S1", policy, op); - return; - } - if (a == 0 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S2", policy, op); - return; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - if (a == 0 && b == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S3", policy, op); - return; + else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S1", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S2", policy, op); + } +#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 + else { // (scalar_y == 2) + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S3", policy, op); + } } - #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // a == -1 - if (a == -1 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S4", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == -1' + // **************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S4", policy, op); + } else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S5", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S6", policy, op); + } else { // (scalar_y == 2) + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S7", policy, op); + } } - if (a == -1 && b == -1) { - Axpby_Functor op(x, y, av, bv, + // **************************************************************** + // Possibilities with 'scalar_x == 1' + // **************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + Axpby_Functor op(x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S5", policy, op); - return; - } - if (a == -1 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S6", policy, op); - return; - } - if (a == -1 && b == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S7", policy, op); - return; - } - // a == 1 - if (a == 1 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S8", policy, op); - return; - } - if (a == 1 && b == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S9", policy, op); - return; - } - if (a == 1 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S10", policy, op); - return; - } - if (a == 1 && b == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S11", policy, op); - return; + Kokkos::parallel_for("KokkosBlas::Axpby::S8", policy, op); + } else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S9", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S10", policy, op); + } else { // (scalar_y == 2) + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S11", policy, op); + } } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a == 2 - if (a == 2 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S12", policy, op); - return; - } - + // **************************************************************** + // Possibilities with 'scalar_x == 2' + // **************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S12", policy, op); + } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 2 && b == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S13", policy, op); - return; - } - if (a == 2 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S14", policy, op); - return; + else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S13", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S14", policy, op); + } +#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 + else { // (scalar_y == 2) + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S15", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a and b arbitrary (not -1, 0, or 1) - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S15", policy, op); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 32653b9cce..52ccc8de60 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -52,8 +52,8 @@ struct Axpby_MV_Functor { AV m_a; BV m_b; - Axpby_MV_Functor(const XMV& X, const YMV& Y, const AV& a, const BV& b) - : numCols(X.extent(1)), m_x(X), m_y(Y), m_a(a), m_b(b) { + Axpby_MV_Functor(const XMV& X, const YMV& Y, const AV& av, const BV& bv) + : numCols(X.extent(1)), m_x(X), m_y(Y), m_a(av), m_b(bv) { // XMV and YMV must be Kokkos::View specializations. static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -92,175 +92,292 @@ struct Axpby_MV_Functor { // scalar_x and scalar_y are compile-time constants (since they // are template parameters), so the compiler should evaluate these // branches at compile time. - if (scalar_x == 0 && scalar_y == 0) { + + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = ATS::zero(); - } - } - if (scalar_x == 0 && scalar_y == -1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = ATS::zero(); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_y(i, k); - } - } - if (scalar_x == 0 && scalar_y == 1) { - return; // Y(i,j) := Y(i,j) - } - if (scalar_x == 0 && scalar_y == 2) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_y(i, k); + } + } else if (scalar_y == 1) { + // Nothing to do: Y(i,j) := Y(i,j) + } else { // if (scalar_y == 2) { + if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_b(k) * m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 0) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k); + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == -1 && scalar_y == -1) { + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 2) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) - m_y(i, k); + } + } else if (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 0) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) + m_y(i, k); + } + } else { // if (scalar_y == 2) { + if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k); - } - } - if (scalar_x == 1 && scalar_y == -1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k) - m_y(i, k); + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 1 && scalar_y == 1) { + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 2) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 0) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) - m_y(i, k); + } + } else if (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k); - } - } - if (scalar_x == 2 && scalar_y == -1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) + m_y(i, k); + } + } else { // if (scalar_y == 2) { + if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 2 && scalar_y == 2) { + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else { // if (scalar_x == 2) { + if (scalar_y == 0) { + if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); - } - } - } + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } + } else if (scalar_y == -1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) - m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); + } + } + } else if (scalar_y == 1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); + } + } + } else { // if (scalar_y == 2) { + if (m_a.extent(0) == 1) { + if (m_b.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } else { + if (m_b.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } + } // if (scalar_y == ...) else if + } // if (scalar_x == ...) else if + } // void operator() }; // Variant of Axpby_MV_Functor, where a and b are scalars. @@ -268,7 +385,7 @@ struct Axpby_MV_Functor { // // 1. Y(i,j) = alpha*X(i,j) + beta*Y(i,j) for alpha,beta in -1,0,1 // 2. Y(i,j) = a*X(i,j) + beta*Y(i,j) for beta in -1,0,1 -// 3. Y(i,j) = alpha*X(i,j) + beta*Y(i,j) for alpha in -1,0,1 +// 3. Y(i,j) = alpha*X(i,j) + b*Y(i,j) for alpha in -1,0,1 // 4. Y(i,j) = a*X(i,j) + b*Y(i,j) // // The template parameters scalar_x and scalar_y correspond to alpha @@ -322,172 +439,181 @@ struct Axpby_MV_Functor::value, "KokkosBlas::Impl::" "Axpby_MV_Unroll_Functor: a is not a Kokkos::View."); @@ -540,10 +666,14 @@ struct Axpby_MV_Unroll_Functor { "BV must have rank 1."); if (startingColumn != 0) { - m_a = Kokkos::subview( - a, std::make_pair(startingColumn, SizeType(a.extent(0)))); - m_b = Kokkos::subview( - b, std::make_pair(startingColumn, SizeType(b.extent(0)))); + if (axpbyVarExtent(m_a) > 1) { + m_a = Kokkos::subview( + av, std::make_pair(startingColumn, SizeType(av.extent(0)))); + } + if (axpbyVarExtent(m_b) > 1) { + m_b = Kokkos::subview( + bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + } } } @@ -555,162 +685,304 @@ struct Axpby_MV_Unroll_Functor { #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY <= 2 - if (scalar_x == 0 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); - } - } - if (scalar_x == 0 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = ATS::zero(); + } + } else { // (scalar_y == 2) { + if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b(k) * m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 2 && scalar_y == 2) { + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { + if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } + } else { // (scalar_y == 2) + if (m_a.extent(0) == 1) { + if (m_b.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } else { + if (m_b.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } } } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL >= 3 - if (scalar_x == 0 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); - } - } - if (scalar_x == 0 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = ATS::zero(); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_y(i, k); - } - } - if (scalar_x == 0 && scalar_y == 1) { - return; // Y(i,j) := Y(i,j) - } - if (scalar_x == 0 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_y(i, k); + } + } else if (scalar_y == 1) { + // Nothing to do: Y(i,j) := Y(i,j) + } else { // (scalar_y == 2) + if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b(k) * m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == -1 && scalar_y == -1) { + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) - m_y(i, k); + } + } else if (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_y(i, k); + } + } else { // (scalar_y == 2) + if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k); - } - } - if (scalar_x == 1 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) - m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 1 && scalar_y == 1) { + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) - m_y(i, k); + } + } else if (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k); - } - } - if (scalar_x == 2 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_y(i, k); + } + } else { // (scalar_y == 2) + if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 2 && scalar_y == 2) { + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } + } else if (scalar_y == -1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) - m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); + } + } + } else if (scalar_y == 1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); + } + } + } else { // (scalar_y == 2) + if (m_a.extent(0) == 1) { + if (m_b.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } else { + if (m_b.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } } } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY @@ -765,162 +1037,178 @@ struct Axpby_MV_Unroll_Functor 2 - if (scalar_x == 0 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); - } - } - if (scalar_x == 0 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = ATS::zero(); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_y(i, k); - } - } - if (scalar_x == 0 && scalar_y == 1) { - return; // Y(i,j) := Y(i,j) - } - if (scalar_x == 0 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_y(i, k); + } + } else if (scalar_y == 1) { + // Nothing to do: Y(i,j) := Y(i,j) + } else { // (scalar_y == 2) #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b * m_y(i, k); + } } } - if (scalar_x == -1 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k); - } - } - if (scalar_x == -1 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) - m_y(i, k); + } + } else if (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_y(i, k); + } + } else { // (scalar_y == 2) #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_b * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_b * m_y(i, k); + } } } - if (scalar_x == 1 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k); - } - } - if (scalar_x == 1 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) - m_y(i, k); + } + } else if (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_y(i, k); + } + } else { // (scalar_y == 2) #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_b * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_b * m_y(i, k); + } } } - if (scalar_x == 2 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k); - } - } - if (scalar_x == 2 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k); + } + } else if (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k) - m_y(i, k); + } + } else if (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k) + m_y(i, k); + } + } else { // (scalar_y == 2) #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k) + m_b * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k) + m_b * m_y(i, k); + } } } @@ -936,10 +1224,10 @@ struct Axpby_MV_Unroll_Functor void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, - const SizeType startingColumn, int a = 2, int b = 2) { + const SizeType startingColumn, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Unrolled: X is not a Kokkos::View."); @@ -973,115 +1262,104 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if (a == 0 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S0", policy, op); - return; - } - + // **************************************************************** + // Possibilities with 'scalar_x == 0' + // **************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S0", policy, op); + } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 0 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S1", policy, op); - return; - } - if (a == 0 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S2", policy, op); - return; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY - - if (a == 0 && b == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S3", policy, op); - return; + else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S1", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S2", policy, op); + } +#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY + else { // (scalar_y == 2) + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S3", policy, op); + } } - #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // a == -1 - if (a == -1 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S4", policy, op); - return; - } - if (a == -1 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S5", policy, op); - return; - } - if (a == -1 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S6", policy, op); - return; - } - if (a == -1 && b == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S7", policy, op); - return; - } - // a == 1 - if (a == 1 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S8", policy, op); - return; - } - if (a == 1 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S9", policy, op); - return; - } - if (a == 1 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S10", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == -1' + // **************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S4", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S5", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S6", policy, op); + } else { // (scalar_y == 2) + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S7", policy, op); + } } - if (a == 1 && b == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S11", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 1' + // **************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S8", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S9", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S10", policy, op); + } else { // (scalar_y == 2) + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S11", policy, op); + } } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a == 2 - if (a == 2 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S12", policy, op); - return; - } - + // **************************************************************** + // Possibilities with 'scalar_x == 2' + // **************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S12", policy, op); + } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 2 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S13", policy, op); - return; - } - if (a == 2 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S14", policy, op); - return; + else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S13", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S14", policy, op); + } +#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 + else { // (scalar_y == 2) + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S15", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a and b arbitrary (not -1, 0, or 1) - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S15", policy, op); } // Invoke the "generic" (not unrolled) multivector functor that @@ -1092,10 +1370,10 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // 3. Y(i,j) = a*X(i,j) + b*Y(i,j) for a in -1,0,1 // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // -// a and b come in as integers. The values -1, 0, and 1 correspond to -// the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: a == 2 -// means use av, and b == 2 means use bv. Otherwise, av resp. vb are +// scalar_x and scalar_y come in as integers. The values -1, 0, and 1 +// correspond to the literal values of the coefficients. The value 2 tells the +// functor to use the corresponding vector of coefficients: scalar_x == 2 +// means use av, and scalar_y == 2 means use bv. Otherwise, av resp. bv are // ignored. // // Any literal coefficient of zero has BLAS semantics of ignoring the @@ -1106,7 +1384,8 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, template void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int a = 2, int b = 2) { + const BV& bv, const YMV& y, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Generic: X is not a Kokkos::View."); @@ -1128,99 +1407,88 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if (a == 0 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S16", policy, op); - return; - } - + // **************************************************************** + // Possibilities with 'scalar_x == 0' + // **************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S16", policy, op); + } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 0 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S17", policy, op); - return; - } - if (a == 0 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S18", policy, op); - return; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - if (a == 0 && b == 2) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S19", policy, op); - return; + else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S17", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S18", policy, op); + } +#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 + else { // (scalar_y == 2) + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S19", policy, op); + } } - #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // a == -1 - if (a == -1 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S20", policy, op); - return; - } - if (a == -1 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S21", policy, op); - return; - } - if (a == -1 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S22", policy, op); - return; - } - if (a == -1 && b == 2) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S23", policy, op); - return; - } - // a == 1 - if (a == 1 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S24", policy, op); - return; - } - if (a == 1 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S25", policy, op); - return; - } - if (a == 1 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S26", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == -1' + // **************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S20", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S21", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S22", policy, op); + } else { // (scalar_y == 2) + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S23", policy, op); + } } - if (a == 1 && b == 2) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S27", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 1' + // **************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S24", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S25", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S26", policy, op); + } else { // (scalar_y == 2) + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S27", policy, op); + } } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a == 2 - if (a == 2 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S28", policy, op); - return; - } - + // **************************************************************** + // Possibilities with 'scalar_x == 2' + // **************************************************************** + else { // (scalar_x == 2) + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S28", policy, op); + } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 2 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S29", policy, op); - return; - } - if (a == 2 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S30", policy, op); - return; + else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S29", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S30", policy, op); + } +#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 + else { // (scalar_x == 2) + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S31", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a and b arbitrary (not -1, 0, or 1) - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S31", policy, op); } // Compute any of the following, in a way optimized for X and Y @@ -1231,10 +1499,10 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, // 3. Y(i,j) = a*X(i,j) + b*Y(i,j) for a in -1,0,1 // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // -// a and b come in as integers. The values -1, 0, and 1 correspond to -// the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: a == 2 -// means use av, and b == 2 means use bv. Otherwise, av resp. vb are +// scalar_x and scalar_y come in as integers. The values -1, 0, and 1 +// correspond to the literal values of the coefficients. The value 2 tells the +// functor to use the corresponding vector of coefficients: scalar_x == 2 +// means use av, and scalar_y == 2 means use bv. Otherwise, av resp. bv are // ignored. // // Any literal coefficient of zero has BLAS semantics of ignoring the @@ -1246,7 +1514,8 @@ template struct Axpby_MV_Invoke_Left { static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int a = 2, int b = 2) { + const BV& bv, const YMV& y, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Left: X is not a Kokkos::View."); @@ -1279,7 +1548,7 @@ struct Axpby_MV_Invoke_Left { // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, a, b); + space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); } for (; j + 4 <= numCols; j += 4) { XMV X_cur = Kokkos::subview(x, Kokkos::ALL(), std::make_pair(j, j + 4)); @@ -1289,7 +1558,7 @@ struct Axpby_MV_Invoke_Left { // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, a, b); + space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); } for (; j < numCols; ++j) { auto x_cur = Kokkos::subview(x, Kokkos::ALL(), j); @@ -1301,23 +1570,23 @@ struct Axpby_MV_Invoke_Left { typedef decltype(x_cur) XV; typedef decltype(y_cur) YV; Axpby_Generic( - space, av, x_cur, bv, y_cur, j, a, b); + space, av, x_cur, bv, y_cur, j, scalar_x, scalar_y); } } }; -// Compute any of the following, in a way optimized for X, Y, and R +// Compute any of the following, in a way optimized for X and Y // being LayoutRight: // // 1. Y(i,j) = a*X(i,j) + b*Y(i,j) for a,b in -1,0,1 // 2. Y(i,j) = av(j)*X(i,j) + b*Y(i,j) for b in -1,0,1 -// 3. Y(i,j) = a*X(i,j) + b*Y(i,j) for a in -1,0,1 +// 3. Y(i,j) = a*X(i,j) + bv(j)*Y(i,j) for a in -1,0,1 // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // -// a and b come in as integers. The values -1, 0, and 1 correspond to -// the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: a == 2 -// means use av, and b == 2 means use bv. Otherwise, av resp. vb are +// scalar_x and scalar_y come in as integers. The values -1, 0, and 1 +// correspond to the literal values of the coefficients. The value 2 tells the +// functor to use the corresponding vector of coefficients: scalar_x == 2 +// means use av, and scalar_y == 2 means use bv. Otherwise, av resp. bv are // ignored. // // Any literal coefficient of zero has BLAS semantics of ignoring the @@ -1329,7 +1598,8 @@ template struct Axpby_MV_Invoke_Right { static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int a = 2, int b = 2) { + const BV& bv, const YMV& y, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Right: X is not a Kokkos::View."); @@ -1355,10 +1625,10 @@ struct Axpby_MV_Invoke_Right { typedef decltype(x_0) XV; typedef decltype(y_0) YV; Axpby_Generic( - space, av, x_0, bv, y_0, 0, a, b); + space, av, x_0, bv, y_0, 0, scalar_x, scalar_y); } else { Axpby_MV_Generic( - space, av, x, bv, y, a, b); + space, av, x, bv, y, scalar_x, scalar_y); } } }; diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index da2924c9f3..1de54e07ca 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -56,6 +56,23 @@ struct axpby_eti_spec_avail { Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ + }; \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -82,13 +99,13 @@ struct axpby_eti_spec_avail { template <> \ struct axpby_eti_spec_avail< \ EXEC_SPACE, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ @@ -150,11 +167,17 @@ struct Axpby { }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -// Full specialization for XMV and YMV rank-2 Views. +// ********************************************************************** +// Full specialization for XMV and YMV rank-2 Views: +// --> AV = anything and BV = anything +// +// If axpby() runs at a device with rank-2 XMV and rank-2 YMV, then +// the unification process forces AV = view and BV = view +// ********************************************************************** template struct Axpby { - typedef typename YMV::size_type size_type; + using size_type = typename YMV::size_type; static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y) { @@ -193,49 +216,57 @@ struct Axpby(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - typedef int index_type; - typedef typename std::conditional< + using index_type = int; + using Axpby_MV_Invoke_Layout = typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } else { - typedef typename XMV::size_type index_type; - typedef typename std::conditional< + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } Kokkos::Profiling::popRegion(); } }; -// Partial specialization for XMV, and YMV rank-2 Views, -// and AV and BV scalars. +// ********************************************************************** +// Partial specialization for XMV and YMV rank-2 Views: +// --> AV = scalar and BV = scalar +// +// If axpby() runs at the host with rank-2 XMV and rank-2 YMV, then +// the unification process _might_ force AV = scalar and BV = scalar +// ********************************************************************** template struct Axpby { - typedef typename XMV::non_const_value_type AV; - typedef typename YMV::non_const_value_type BV; - typedef typename YMV::size_type size_type; - typedef Kokkos::ArithTraits ATA; - typedef Kokkos::ArithTraits ATB; + using AV = typename XMV::non_const_value_type; + using BV = typename YMV::non_const_value_type; + using size_type = typename YMV::size_type; + using ATA = Kokkos::ArithTraits; + using ATB = Kokkos::ArithTraits; static void axpby(const execution_space& space, const AV& alpha, const XMV& X, const BV& beta, const YMV& Y) { @@ -275,9 +306,10 @@ struct Axpby 2 - else { - a = 2; - } + + int b(2); if (beta == ATB::zero()) { b = 0; } @@ -301,43 +332,89 @@ struct Axpby 2 - else { - b = 2; - } if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - typedef int index_type; - typedef typename std::conditional< + using index_type = int; + using Axpby_MV_Invoke_Layout = typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } else { - typedef typename XMV::size_type index_type; - typedef typename std::conditional< + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } Kokkos::Profiling::popRegion(); } }; -// Partial specialization for XV and YV rank-1 Views, -// and AV and BV scalars. +// ********************************************************************** +// Full specialization for XV and YV rank-1 Views: +// --> AV = anything and BV = anything +// +// If axpby() runs at a device with rank-1 XV and rank-1 YV, then +// the unification process forces AV = view and BV = view +// ********************************************************************** +template +struct Axpby { + using size_type = typename YV::size_type; + + static void axpby(const execution_space& space, const AV& av, const XV& X, + const BV& bv, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? "KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); + + size_type const numRows = X.extent(0); + + int a(2); + if (av.extent(0) == 0) { + a = 0; + } + + int b(2); + if (bv.extent(0) == 0) { + b = 0; + } + + if (numRows < static_cast(INT_MAX)) { + using index_type = int; + Axpby_Generic( + space, av, X, bv, Y, 0, a, b); + } else { + using index_type = typename XV::size_type; + Axpby_Generic( + space, av, X, bv, Y, 0, a, b); + } + + Kokkos::Profiling::popRegion(); + } +}; + +// ********************************************************************** +// Partial specialization for XV and YV rank-1 Views: +// --> AV = scalar and BV = scalar +// +// If axpby() runs at the host with rank-1 XV and rank-1 YV, then +// the unification process forces AV = scalar and BV = scalar +// ********************************************************************** template struct Axpby { - typedef typename XV::non_const_value_type AV; - typedef typename YV::non_const_value_type BV; - typedef typename YV::size_type size_type; - typedef Kokkos::ArithTraits ATA; - typedef Kokkos::ArithTraits ATB; + using AV = typename XV::non_const_value_type; + using BV = typename YV::non_const_value_type; + using size_type = typename YV::size_type; + using ATA = Kokkos::ArithTraits; + using ATB = Kokkos::ArithTraits; static void axpby(const execution_space& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { @@ -377,8 +454,9 @@ struct Axpby 2 - int b = 2; + int b(2); if (beta == ATB::zero()) { b = 0; } @@ -403,12 +481,12 @@ struct Axpby 2 if (numRows < static_cast(INT_MAX)) { - typedef int index_type; + using index_type = int; Axpby_Generic( space, alpha, X, beta, Y, 0, a, b); } else { - typedef typename XV::size_type index_type; + using index_type = typename XV::size_type; Axpby_Generic( space, alpha, X, beta, Y, 0, a, b); @@ -437,6 +515,20 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + extern template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, false, true>; @@ -448,6 +540,20 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, false, true>; diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp new file mode 100644 index 0000000000..46a29400fd --- /dev/null +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -0,0 +1,1009 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#define KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ + +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// -------------------------------- + +template +constexpr int typeRank() { + if constexpr (Kokkos::is_view_v) { + return T::rank; + } + return -1; +} + +// -------------------------------- + +template +constexpr typename std::enable_if, bool>::type Tr0_val() { + return (T::rank == 0); +} + +template +constexpr typename std::enable_if, bool>::type Tr0_val() { + return false; +} + +// -------------------------------- + +template +constexpr typename std::enable_if, bool>::type Tr1s_val() { + return (T::rank == 1) && (T::rank_dynamic == 0); +} + +template +constexpr typename std::enable_if, bool>::type +Tr1s_val() { + return false; +} + +// -------------------------------- + +template +constexpr typename std::enable_if, bool>::type Tr1d_val() { + return (T::rank == 1) && (T::rank_dynamic == 1); +} + +template +constexpr typename std::enable_if, bool>::type +Tr1d_val() { + return false; +} + +// -------------------------------- + +template +struct getScalarTypeFromView { + using type = void; +}; + +template +struct getScalarTypeFromView { + using type = typename T::value_type; +}; + +// -------------------------------- + +template +struct getLayoutFromView { + using type = void; +}; + +template +struct getLayoutFromView { + using type = typename T::array_layout; +}; + +// -------------------------------- + +template +constexpr bool isTypeComplex() { + return (std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v>); +} + +// -------------------------------- + +template +struct AxpbyUnificationAttemptTraits { + static constexpr bool atDevCase = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + static constexpr bool atHostCase = !atDevCase; + + static constexpr bool Asc = !Kokkos::is_view_v; + static constexpr bool Ar0 = Tr0_val(); + static constexpr bool Ar1s = Tr1s_val(); + static constexpr bool Ar1d = Tr1d_val(); + static constexpr bool Avi = Ar0 || Ar1s || Ar1d; + + static constexpr bool Xr1 = Kokkos::is_view_v && (XMV::rank == 1); + static constexpr bool Xr2 = Kokkos::is_view_v && (XMV::rank == 2); + + static constexpr bool Bsc = !Kokkos::is_view_v; + static constexpr bool Br0 = Tr0_val(); + static constexpr bool Br1s = Tr1s_val(); + static constexpr bool Br1d = Tr1d_val(); + static constexpr bool Bvi = Br0 || Br1s || Br1d; + + static constexpr bool Yr1 = Kokkos::is_view_v && (YMV::rank == 1); + static constexpr bool Yr2 = Kokkos::is_view_v && (YMV::rank == 2); + + static constexpr bool xyRank1Case = Xr1 && Yr1; + static constexpr bool xyRank2Case = Xr2 && Yr2; + + // ******************************************************************** + // In order to better understand the lines between now and right before + // the constructor, assume that all constructor checks. + // ******************************************************************** + + // ******************************************************************** + // Declare 'AtInputScalarTypeA' + // ******************************************************************** + using ScalarTypeA2_atDev = + typename getScalarTypeFromView::type; + using ScalarTypeA1_atDev = + std::conditional_t; + + using ScalarTypeA2_atHost = + typename getScalarTypeFromView::type; + using ScalarTypeA1_atHost = + std::conditional_t; + + using AtInputScalarTypeA = + std::conditional_t; + + using AtInputScalarTypeA_nonConst = typename std::conditional_t< + std::is_const_v, + typename std::remove_const::type, AtInputScalarTypeA>; + + static constexpr bool atInputScalarTypeA_isComplex = + isTypeComplex(); + + // ******************************************************************** + // Declare 'AtInputScalarTypeX' + // ******************************************************************** + using AtInputScalarTypeX = + typename XMV::value_type; // 'const' not removed if present + + using AtInputScalarTypeX_nonConst = typename std::conditional_t< + std::is_const_v, + typename std::remove_const::type, AtInputScalarTypeX>; + + static constexpr bool atInputScalarTypeX_isComplex = + isTypeComplex(); + + // ******************************************************************** + // Declare 'AtInputScalarTypeB' + // ******************************************************************** + using ScalarTypeB2_atDev = + typename getScalarTypeFromView::type; + using ScalarTypeB1_atDev = + std::conditional_t; + + using ScalarTypeB2_atHost = + typename getScalarTypeFromView::type; + using ScalarTypeB1_atHost = + std::conditional_t; + + using AtInputScalarTypeB = + std::conditional_t; + + using AtInputScalarTypeB_nonConst = typename std::conditional_t< + std::is_const_v, + typename std::remove_const::type, AtInputScalarTypeB>; + + static constexpr bool atInputScalarTypeB_isComplex = + isTypeComplex(); + + // ******************************************************************** + // Declare 'AtInputScalarTypeY' + // ******************************************************************** + using AtInputScalarTypeY = + typename YMV::value_type; // 'const' not removed if present + + using AtInputScalarTypeY_nonConst = typename std::conditional_t< + std::is_const_v, + typename std::remove_const::type, AtInputScalarTypeY>; + + static constexpr bool atInputScalarTypeY_isComplex = + isTypeComplex(); + + // ******************************************************************** + // Declare internal layouts + // ******************************************************************** + using InternalLayoutX = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using InternalLayoutY = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + YMV, InternalLayoutX>::array_layout; + + // ******************************************************************** + // Declare 'InternalTypeA_tmp' + // ******************************************************************** + using AtInputLayoutA = typename getLayoutFromView::type; + static constexpr bool atInputLayoutA_isStride = + std::is_same_v; + using InternalLayoutA = + std::conditional_t<(Ar1d || Ar1s) && atInputLayoutA_isStride, + AtInputLayoutA, InternalLayoutX>; + + static constexpr bool atInputScalarTypeA_mustRemain = + atInputScalarTypeA_isComplex && !atInputScalarTypeX_isComplex; + + using InternalScalarTypeA = std::conditional_t< + atInputScalarTypeA_mustRemain || ((Ar1d || Ar1s) && xyRank2Case), + AtInputScalarTypeA_nonConst // Yes, keep the input scalar type + , + AtInputScalarTypeX_nonConst // Yes, instead of + // 'AtInputScalarTypeA_nonConst' + >; + + using InternalTypeA_atDev = + Kokkos::View>; + + using InternalTypeA_atHost = std::conditional_t< + (Ar1d || Ar1s) && xyRank2Case && atHostCase, + Kokkos::View>, + InternalScalarTypeA>; + + using InternalTypeA_tmp = + std::conditional_t; + + // ******************************************************************** + // Declare 'InternalTypeX' + // ******************************************************************** + using InternalTypeX = std::conditional_t< + Xr2, + Kokkos::View>, + Kokkos::View>>; + + // ******************************************************************** + // Declare 'InternalTypeB_tmp' + // ******************************************************************** + using AtInputLayoutB = typename getLayoutFromView::type; + static constexpr bool atInputLayoutB_isStride = + std::is_same_v; + using InternalLayoutB = + std::conditional_t<(Br1d || Br1s) && atInputLayoutB_isStride, + AtInputLayoutB, InternalLayoutY>; + + static constexpr bool atInputScalarTypeB_mustRemain = + atInputScalarTypeB_isComplex && !atInputScalarTypeY_isComplex; + + using InternalScalarTypeB = std::conditional_t< + atInputScalarTypeB_mustRemain || ((Br1d || Br1s) && xyRank2Case), + AtInputScalarTypeB_nonConst // Yes, keep the input scalar type + , + AtInputScalarTypeY_nonConst // Yes, instead of + // 'AtInputScalarTypeB_nonConst' + >; + + using InternalTypeB_atDev = + Kokkos::View>; + + using InternalTypeB_atHost = std::conditional_t< + ((Br1d || Br1s) && xyRank2Case && atHostCase), + Kokkos::View>, + InternalScalarTypeB>; + + using InternalTypeB_tmp = + std::conditional_t; + + // ******************************************************************** + // Declare 'InternalTypeY' + // ******************************************************************** + using InternalTypeY = std::conditional_t< + Yr2, + Kokkos::View>, + Kokkos::View>>; + + // ******************************************************************** + // Declare 'InternalTypeA': if 'InternalTypeB_tmp' is a view then + // make sure 'InternalTypeA' is a view as well + // ******************************************************************** + using InternalTypeA = std::conditional_t< + !Kokkos::is_view_v && + Kokkos::is_view_v, + Kokkos::View>, + InternalTypeA_tmp>; + + // ******************************************************************** + // Declare 'InternalTypeA_managed' with the same scalar type in + // 'InternalTypeA' + // ******************************************************************** + using InternalLayoutA_managed = InternalLayoutA; + using InternalTypeA_managed = std::conditional_t< + Kokkos::is_view_v, + Kokkos::View, + void>; + + // ******************************************************************** + // Declare 'InternalTypeB' if 'InternalTypeA_tmp' is a view then + // make sure 'InternalTypeB' is a view as well + // ******************************************************************** + using InternalTypeB = std::conditional_t< + Kokkos::is_view_v && + !Kokkos::is_view_v, + Kokkos::View>, + InternalTypeB_tmp>; + + // ******************************************************************** + // Declare 'InternalTypeB_managed' with the same scalar type in + // 'InternalTypeB' + // ******************************************************************** + using InternalLayoutB_managed = InternalLayoutB; + using InternalTypeB_managed = std::conditional_t< + Kokkos::is_view_v, + Kokkos::View, + void>; + + // ******************************************************************** + // Auxiliary Boolean results on internal types + // ******************************************************************** + static constexpr bool internalTypeA_sc = !Kokkos::is_view_v; + static constexpr bool internalTypeA_r1d = Tr1d_val(); + + static constexpr bool internalTypeB_sc = !Kokkos::is_view_v; + static constexpr bool internalTypeB_r1d = Tr1d_val(); + + static constexpr bool internalTypesAB_bothScalars = + (internalTypeA_sc && internalTypeB_sc); + static constexpr bool internalTypesAB_bothViews = + (internalTypeA_r1d && internalTypeB_r1d); + + static void performChecks(const AV& a, const XMV& X, const BV& b, + const YMV& Y) { + // ****************************************************************** + // Check 1/6: General checks + // ****************************************************************** + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + ": tExecSpace must be a valid Kokkos execution space."); + + if constexpr ((xyRank1Case && !xyRank2Case) || + (!xyRank1Case && xyRank2Case)) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 1/6" + << ", invalid general case" + << ": xyRank1Case = " << xyRank1Case + << ", xyRank2Case = " << xyRank2Case; + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + if constexpr (atInputScalarTypeY_isComplex == false) { + if constexpr ((atInputScalarTypeA_isComplex == false) && + (atInputScalarTypeX_isComplex == false) && + (atInputScalarTypeB_isComplex == false)) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 1/6" + << ", invalid combination on scalar types: if Y is not complex, " + "then A, X and B cannot be complex" + << ": AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() + << ", AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() + << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() + << ", AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name(); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } + + // ****************************************************************** + // Check 2/6: YMV is valid + // ****************************************************************** + static_assert(Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + ": Y is not a Kokkos::View."); + static_assert(std::is_same::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + ": Y is const. It must be nonconst, " + "because it is an output argument " + "(we must be able to write to its entries)."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + ": XMV must be accessible from tExecSpace"); + + if constexpr ((Yr1 && !Yr2) || (!Yr1 && Yr2)) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 2/6" + << ", invalid YMV" + << ": Yr1 = " << Yr1 << ", Yr2 = " << Yr2; + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + // ****************************************************************** + // Check 3/6: XMV is valid + // ****************************************************************** + static_assert(Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + ": X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + ": XMV must be accessible from tExecSpace"); + + if constexpr ((Xr1 && !Xr2) || (!Xr1 && Xr2)) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 3/6" + << ", invalid XMV" + << ": Xr1 = " << Xr1 << ", Xr2 = " << Xr2; + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + if constexpr (xyRank1Case) { + if (X.extent(0) == Y.extent(0)) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 3/6" + << ", invalid rank-1 X extent" + << ": X.extent(0) = " << X.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + if ((X.extent(0) == Y.extent(0)) && (X.extent(1) == Y.extent(1))) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 3/6" + << ", invalid rank-2 X extents" + << ": X.extent(0) = " << X.extent(0) + << ", X.extent(1) = " << X.extent(1) + << ", Y.extent(0) = " << Y.extent(0) + << ", Y.extent(1) = " << Y.extent(1); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } + + // ****************************************************************** + // Check 4/6: AV is valid + // ****************************************************************** + if constexpr ((Asc && !Ar0 && !Ar1s && !Ar1d) || + (!Asc && Ar0 && !Ar1s && !Ar1d) || + (!Asc && !Ar0 && Ar1s && !Ar1d) || + (!Asc && !Ar0 && !Ar1s && Ar1d)) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" + << ", invalid AV = " << typeid(AV).name() << ": Asc = " << Asc + << ", Ar0 = " << Ar0 << ", Ar1s = " << Ar1s << ", Ar1d = " << Ar1d; + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + if constexpr (Asc || Avi) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" + << ", AV memory must be either scalar or view" + << ": Asc = " << Asc << ", Avi = " << Avi; + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + if constexpr (Ar1d || Ar1s) { + if constexpr (xyRank1Case) { + if (a.extent(0) == 1) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" + << ", view 'a' must have extent(0) == 1 for xyRank1Case" + << ": a.extent(0) = " << a.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + if ((a.extent(0) == 1) || + (a.extent(0) == Y.extent(1))) { // Yes, 'Y' is the reference + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" + << ", view 'a' must have extent(0) == 1 or Y.extent(1) for " + "xyRank2Case" + << ": a.extent(0) = " << a.extent(0) + << ", Y.extent(0) = " << Y.extent(0) + << ", Y.extent(1) = " << Y.extent(1); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } // if (rank1Case) else + } // if Ar1d + + // ****************************************************************** + // Check 5/6: BV is valid + // ****************************************************************** + if constexpr ((Bsc && !Br0 && !Br1s && !Br1d) || + (!Bsc && Br0 && !Br1s && !Br1d) || + (!Bsc && !Br0 && Br1s && !Br1d) || + (!Bsc && !Br0 && !Br1s && Br1d)) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" + << ", invalid BV" + << ": Bsc = " << Bsc << ", Br0 = " << Br0 << ", Br1s = " << Br1s + << ", Br1d = " << Br1d; + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + if constexpr (Bsc || Bvi) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" + << ", BV memory must be either scalar or view" + << ": Bsc = " << Bsc << ", Bvi = " << Bvi; + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + if constexpr (Br1d || Br1s) { + if constexpr (xyRank1Case) { + if (b.extent(0) == 1) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" + << ", view 'b' must have extent(0) == 1 for xyRank1Case" + << ": b.extent(0) = " << b.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + if ((b.extent(0) == 1) || (b.extent(0) == Y.extent(1))) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" + << ", view 'b' must have extent(0) == 1 or Y.extent(1) for " + "xyRank2Case" + << ": b.extent(0) = " << b.extent(0) + << ", Y.extent(0) = " << Y.extent(0) + << ", Y.extent(1) = " << Y.extent(1); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } // if (rank1Case) else + } // if Br1d + + // ****************************************************************** + // Check 6/6: Checks on InternalTypeA, X, B, Y + // ****************************************************************** + if constexpr (atHostCase) { + if constexpr (xyRank1Case) { + constexpr bool internalTypeA_isOk = + (internalTypeA_sc || internalTypeA_r1d); + constexpr bool internalTypeX_isOk = std::is_same_v< + InternalTypeX, + Kokkos::View>>; + constexpr bool internalTypeB_isOk = + (internalTypeB_sc || internalTypeB_r1d); + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + if constexpr (internalTypeA_isOk && internalTypeX_isOk && + internalTypeB_isOk && internalTypeY_isOk) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " + "6.1/6" + << ", invalid internal types" + << ": atHostCase = " << atHostCase + << ", atDevCase = " << atDevCase + << ", xyRank1Case= " << xyRank1Case + << ", xyRank2Case= " << xyRank2Case + << ", InternalTypeA = " << typeid(InternalTypeA).name() + << ", InternalTypeX = " << typeid(InternalTypeX).name() + << ", InternalTypeB = " << typeid(InternalTypeB).name() + << ", InternalTypeY = " << typeid(InternalTypeY).name(); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + constexpr bool internalTypeA_isOk = + (internalTypeA_sc || internalTypeA_r1d); + constexpr bool internalTypeX_isOk = std::is_same_v< + InternalTypeX, + Kokkos::View>>; + constexpr bool internalTypeB_isOk = + (internalTypeB_sc || internalTypeB_r1d); + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + if constexpr (internalTypeA_isOk && internalTypeX_isOk && + internalTypeB_isOk && internalTypeY_isOk) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " + "6.2/6" + << ", invalid internal types" + << ": atHostCase = " << atHostCase + << ", atDevCase = " << atDevCase + << ", xyRank1Case= " << xyRank1Case + << ", xyRank2Case= " << xyRank2Case + << ", InternalTypeA = " << typeid(InternalTypeA).name() + << ", InternalTypeX = " << typeid(InternalTypeX).name() + << ", InternalTypeB = " << typeid(InternalTypeB).name() + << ", InternalTypeY = " << typeid(InternalTypeY).name(); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } + } else { + if constexpr (xyRank1Case) { + constexpr bool internalTypeA_isOk = internalTypeA_r1d; + constexpr bool internalTypeX_isOk = std::is_same_v< + InternalTypeX, + Kokkos::View>>; + constexpr bool internalTypeB_isOk = internalTypeB_r1d; + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + if constexpr (internalTypeA_isOk && internalTypeX_isOk && + internalTypeB_isOk && internalTypeY_isOk) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " + "6.3/6" + << ", invalid internal types" + << ": atHostCase = " << atHostCase + << ", atDevCase = " << atDevCase + << ", xyRank1Case= " << xyRank1Case + << ", xyRank2Case= " << xyRank2Case + << ", InternalTypeA = " << typeid(InternalTypeA).name() + << ", InternalTypeX = " << typeid(InternalTypeX).name() + << ", InternalTypeB = " << typeid(InternalTypeB).name() + << ", InternalTypeY = " << typeid(InternalTypeY).name(); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + constexpr bool internalTypeA_isOk = internalTypeA_r1d; + constexpr bool internalTypeX_isOk = std::is_same_v< + InternalTypeX, + Kokkos::View>>; + constexpr bool internalTypeB_isOk = internalTypeB_r1d; + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + if constexpr (internalTypeA_isOk && internalTypeX_isOk && + internalTypeB_isOk && internalTypeY_isOk) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " + "6.4/6" + << ", invalid internal types" + << ": atHostCase = " << atHostCase + << ", atDevCase = " << atDevCase + << ", xyRank1Case= " << xyRank1Case + << ", xyRank2Case= " << xyRank2Case + << ", InternalTypeA = " << typeid(InternalTypeA).name() + << ", InternalTypeX = " << typeid(InternalTypeX).name() + << ", InternalTypeB = " << typeid(InternalTypeB).name() + << ", InternalTypeY = " << typeid(InternalTypeY).name(); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } + } + + if constexpr (atHostCase) { + // **************************************************************** + // We are in the 'atHostCase' case, with 2 possible subcases:: + // + // 1) xyRank1Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] + // + // or + // + // 2) xyRank2Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] + // **************************************************************** + static_assert( + internalTypesAB_bothScalars || internalTypesAB_bothViews, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), atHostCase, " + "invalid combination of types"); + } // If atHostCase + else if constexpr (atDevCase) { + // **************************************************************** + // We are in the 'atDevCase' case, with 2 possible subcases: + // + // 1) xyRank1Case, with only one possible situation: + // - [InternalTypeA / B] = [view, view] + // + // or + // + // 2) xyRank2Case, with only one possible situation: + // - [InternalTypeA / B] = [view, view] + // **************************************************************** + static_assert( + internalTypesAB_bothViews, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), atDevCase, " + "invalid combination of types"); + } + + if constexpr (xyRank2Case && (Ar1d || Ar1s) && atInputLayoutA_isStride) { + if (std::is_same_v< + typename getLayoutFromView< + InternalTypeA, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 6.5/6" + << ", xyRank2Case = " << xyRank2Case + << ", coeff 'a' is rank-1 and has LayoutStride at input, but no " + "LayoutStride internally" + << ", AV = " << typeid(AV).name() + << ", InternalTypeA = " << typeid(InternalTypeA).name(); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } + + if constexpr (xyRank2Case && (Br1d || Br1s) && atInputLayoutB_isStride) { + if (std::is_same_v< + typename getLayoutFromView< + InternalTypeB, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 6.6/6" + << ", xyRank2Case = " << xyRank2Case + << ", coeff 'a' is rank-1 and has LayoutStride at input, but no " + "LayoutStride internally" + << ", BV = " << typeid(BV).name() + << ", InternalTypeB = " << typeid(InternalTypeB).name(); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } + } // Constructor + + static void printInformation(std::ostream& os, std::string const& headerMsg) { + os << headerMsg << ": AV = " + << typeid(AV).name() + //<< ", AV::const_data_type = " << typeid(AV::const_data_type).name() + //<< ", AV::non_const_data_type = " << + // typeid(AV::non_const_data_type).name() + << ", AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() + << ", isConst = " + << std::is_const_v << ", isComplex = " + << atInputScalarTypeA_isComplex << ", AtInputScalarTypeA_nonConst = " + << typeid(AtInputScalarTypeA_nonConst).name() + << ", InternalTypeA = " << typeid(InternalTypeA).name() << "\n" + << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() + << "\n" + << "\n" + << "XMV = " << typeid(XMV).name() << "\n" + << "XMV::value_type = " << typeid(typename XMV::value_type).name() + << "\n" + << "XMV::const_data_type = " + << typeid(typename XMV::const_data_type).name() << "\n" + << "XMV::non_const_data_type = " + << typeid(typename XMV::non_const_data_type).name() << "\n" + << "AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() << "\n" + << "isConst = " << std::is_const_v << "\n" + << "isComplex = " << atInputScalarTypeX_isComplex << "\n" + << "AtInputScalarTypeX_nonConst = " + << typeid(AtInputScalarTypeX_nonConst).name() << "\n" + << "InternalTypeX = " << typeid(InternalTypeX).name() << "\n" + << "\n" + << "BV = " + << typeid(BV).name() + //<< ", BV::const_data_type = " << typeid(BV::const_data_type).name() + //<< ", BV::non_const_data_type = " << + // typeid(BV::non_const_data_type).name() + << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() + << ", isConst = " + << std::is_const_v << ", isComplex = " + << atInputScalarTypeB_isComplex << ", AtInputScalarTypeB_nonConst = " + << typeid(AtInputScalarTypeB_nonConst).name() + << ", InternalTypeB = " << typeid(InternalTypeB).name() << "\n" + << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() + << "\n" + << "\n" + << "YMV = " << typeid(YMV).name() << "\n" + << "YMV::value_type = " << typeid(typename YMV::value_type).name() + << "\n" + << "YMV::const_data_type = " + << typeid(typename YMV::const_data_type).name() << "\n" + << "YMV::non_const_data_type = " + << typeid(typename YMV::non_const_data_type).name() << "\n" + << "AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name() << "\n" + << "isConst = " << std::is_const_v << "\n" + << "isComplex = " << atInputScalarTypeY_isComplex << "\n" + << "AtInputScalarTypeY_nonConst = " + << typeid(AtInputScalarTypeY_nonConst).name() << "\n" + << "InternalTypeY = " << typeid(InternalTypeY).name() << "\n" + << std::endl; + } +}; // struct AxpbyUnificationAttemptTraits + +// -------------------------------- + +template +struct getScalarValueFromVariableAtHost { + getScalarValueFromVariableAtHost() { + static_assert((rankT == -1) || (rankT == 0) || (rankT == 1), + "Generic struct should not have been invoked!"); + } +}; + +template +struct getScalarValueFromVariableAtHost { + static T getValue(T const& var) { return var; } +}; + +template +struct getScalarValueFromVariableAtHost { + static typename T::value_type getValue(T const& var) { return var(); } +}; + +template +struct getScalarValueFromVariableAtHost { + static typename T::value_type getValue(T const& var) { return var[0]; } +}; + +// -------------------------------- + +template +size_t getAmountOfScalarsInCoefficient(T const& coeff) { + size_t result = 1; + if constexpr (Kokkos::is_view_v) { + if constexpr (T::rank == 1) { + result = coeff.extent(0); + } + } + return result; +} + +// -------------------------------- + +template +size_t getStrideInCoefficient(T const& coeff) { + size_t result = 1; + if constexpr (Kokkos::is_view_v) { + if constexpr ((T::rank == 1) && (std::is_same_v)) { + result = coeff.stride_0(); + } + } + return result; +} + +// -------------------------------- + +template +static void populateRank1Stride1ViewWithScalarOrNonStrideView( + T_in const& coeff_in, T_out& coeff_out) { + // *********************************************************************** + // 'coeff_out' is assumed to be rank-1, of LayoutLeft or LayoutRight + // + // One has to be careful with situations like the following: + // - a coeff_in that deals with 'double', and + // - a coeff_out deals with 'complex' + // *********************************************************************** + using ScalarOutType = + typename std::remove_const::type; + + if constexpr (!Kokkos::is_view_v) { + // ********************************************************************* + // 'coeff_in' is scalar + // ********************************************************************* + ScalarOutType scalarValue(coeff_in); + Kokkos::deep_copy(coeff_out, scalarValue); + } else if constexpr (T_in::rank == 0) { + // ********************************************************************* + // 'coeff_in' is rank-0 + // ********************************************************************* + typename T_in::HostMirror h_coeff_in("h_coeff_in"); + Kokkos::deep_copy(h_coeff_in, coeff_in); + ScalarOutType scalarValue(h_coeff_in()); + Kokkos::deep_copy(coeff_out, scalarValue); + } else { + // ********************************************************************* + // 'coeff_in' is also rank-1 + // ********************************************************************* + if (coeff_out.extent(0) != coeff_in.extent(0)) { + std::ostringstream msg; + msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" + << ": 'in' and 'out' should have the same extent(0)" + << ", T_in = " << typeid(T_in).name() + << ", coeff_in.label() = " << coeff_in.label() + << ", coeff_in.extent(0) = " << coeff_in.extent(0) + << ", T_out = " << typeid(T_out).name() + << ", coeff_out.label() = " << coeff_out.label() + << ", coeff_out.extent(0) = " << coeff_out.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + using ScalarInType = + typename std::remove_const::type; + if constexpr (std::is_same_v) { + coeff_out = coeff_in; + } else if (coeff_out.extent(0) == 1) { + typename T_in::HostMirror h_coeff_in("h_coeff_in"); + Kokkos::deep_copy(h_coeff_in, coeff_in); + ScalarOutType scalarValue(h_coeff_in[0]); + Kokkos::deep_copy(coeff_out, scalarValue); + } else { + std::ostringstream msg; + msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" + << ": scalar types 'in' and 'out' should be the same" + << ", T_in = " << typeid(T_in).name() + << ", ScalarInType = " << typeid(ScalarInType).name() + << ", coeff_in.label() = " << coeff_in.label() + << ", coeff_in.extent(0) = " << coeff_in.extent(0) + << ", T_out = " << typeid(T_out).name() + << ", ScalarOutType = " << typeid(ScalarOutType).name() + << ", coeff_out.label() = " << coeff_out.label() + << ", coeff_out.extent(0) = " << coeff_out.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } +} // populateRank1Stride1ViewWithScalarOrNonStrideView() + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 2f59cb4cce..de1727d817 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -21,120 +21,223 @@ #include #include #include +#include // axpby() accepts both scalar coefficients a and b, and vector // coefficients (apply one for each column of the input multivectors). // This traits class helps axpby() select the correct specialization -// of AV and BV (the type of a resp. b) for invoking the +// of AV (type of 'a') and BV (type of 'b') for invoking the // implementation. namespace KokkosBlas { /// \brief Computes Y := a*X + b*Y /// -/// This function is non-blocking and thread safe. +/// This function is non-blocking and thread-safe. /// -/// \tparam execution_space a Kokkos execution space where the kernel will run. -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam BV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam execution_space The type of execution space where the kernel +/// will run. +/// \tparam AV Scalar or 0-D or 1-D Kokkos::View. +/// \tparam XMV 1-D Kokkos::View or 2-D Kokkos::View. It +/// must have the same rank as YMV. +/// \tparam BV Scalar or 0-D or 1-D Kokkos::View. +/// \tparam YMV 1-D or 2-D Kokkos::View. /// -/// \param space [in] the execution space instance on which the kernel will run. -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param b [in] view of type BV, scaling parameter for Y. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param pExecSpace [in] The execution space instance on which the kernel +/// will run. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param b [in] input of type BV: +/// - scaling parameter for 1-D or 2-D Y, +/// - scaling parameters for 2-D Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. template -void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, - const YMV& Y) { - static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::axpby: execution_space must be a valid Kokkos " - "execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::axpby: " - "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::axpby: XMV must be accessible from execution_space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::axpby: " - "Y is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::axpby: XMV must be accessible from execution_space"); - static_assert(std::is_same::value, - "KokkosBlas::axpby: Y is const. It must be nonconst, " - "because it is an output argument " - "(we must be able to write to its entries)."); - static_assert(int(YMV::rank) == int(XMV::rank), - "KokkosBlas::axpby: " - "X and Y must have the same rank."); - static_assert(YMV::rank == 1 || YMV::rank == 2, - "KokkosBlas::axpby: " - "XMV and YMV must either have rank 1 or rank 2."); - - // Check compatibility of dimensions at run time. - if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { - std::ostringstream os; - os << "KokkosBlas::axpby: Dimensions of X and Y do not match: " - << "X: " << X.extent(0) << " x " << X.extent(1) << ", Y: " << Y.extent(0) - << " x " << Y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } +void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, + const BV& b, const YMV& Y) { + using AxpbyTraits = + Impl::AxpbyUnificationAttemptTraits; + using InternalTypeA = typename AxpbyTraits::InternalTypeA; + using InternalTypeX = typename AxpbyTraits::InternalTypeX; + using InternalTypeB = typename AxpbyTraits::InternalTypeB; + using InternalTypeY = typename AxpbyTraits::InternalTypeY; + + // ********************************************************************** + // Perform compile time checks and run time checks. + // ********************************************************************** + AxpbyTraits::performChecks(a, X, b, Y); + // AxpbyTraits::printInformation(std::cout, "axpby(), unif information"); + + // ********************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // ********************************************************************** + InternalTypeX internal_X = X; + InternalTypeY internal_Y = Y; + + if constexpr (AxpbyTraits::internalTypesAB_bothScalars) { + InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost< + AV, Impl::typeRank()>::getValue(a)); + InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost< + BV, Impl::typeRank()>::getValue(b)); + + Impl::Axpby::axpby(pExecSpace, internal_a, internal_X, + internal_b, internal_Y); + } else if constexpr (AxpbyTraits::internalTypesAB_bothViews) { + constexpr bool internalLayoutA_isStride( + std::is_same_v); + constexpr bool internalLayoutB_isStride( + std::is_same_v); + + const size_t k_a(Impl::getAmountOfScalarsInCoefficient(a)); + const size_t k_b(Impl::getAmountOfScalarsInCoefficient(b)); - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedYLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - YMV, UnifiedXLayout>::array_layout; - - // Create unmanaged versions of the input Views. XMV and YMV may be - // rank 1 or rank 2. AV and BV may be either rank-1 Views, or - // scalar values. - using XMV_Internal = Kokkos::View >; - using YMV_Internal = Kokkos::View >; - using AV_Internal = - typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; - using BV_Internal = - typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; - - AV_Internal a_internal = a; - XMV_Internal X_internal = X; - BV_Internal b_internal = b; - YMV_Internal Y_internal = Y; - - Impl::Axpby::axpby(space, a_internal, X_internal, b_internal, - Y_internal); + const size_t s_a(Impl::getStrideInCoefficient(a)); + const size_t s_b(Impl::getStrideInCoefficient(b)); + + Kokkos::LayoutStride layoutStrideA{k_a, s_a}; + Kokkos::LayoutStride layoutStrideB{k_b, s_b}; + + InternalTypeA internal_a; + InternalTypeB internal_b; + + if constexpr (internalLayoutA_isStride) { + // ****************************************************************** + // Prepare internal_a + // ****************************************************************** + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", + layoutStrideA); + if constexpr (AxpbyTraits::atInputLayoutA_isStride) { + Kokkos::deep_copy(managed_a, a); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(a, managed_a); + } + internal_a = managed_a; + + if constexpr (internalLayoutB_isStride) { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + layoutStrideB); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // **************************************************************** + Impl::Axpby::axpby(pExecSpace, internal_a, + internal_X, internal_b, + internal_Y); + } else { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", k_b); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // **************************************************************** + Impl::Axpby::axpby(pExecSpace, internal_a, + internal_X, internal_b, + internal_Y); + } + + } else { + // ****************************************************************** + // Prepare internal_a + // ****************************************************************** + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", k_a); + if constexpr (AxpbyTraits::atInputLayoutA_isStride) { + Kokkos::deep_copy(managed_a, a); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(a, managed_a); + } + internal_a = managed_a; + + if constexpr (internalLayoutB_isStride) { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + layoutStrideB); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // **************************************************************** + Impl::Axpby::axpby(pExecSpace, internal_a, + internal_X, internal_b, + internal_Y); + } else { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", k_b); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // **************************************************************** + Impl::Axpby::axpby(pExecSpace, internal_a, + internal_X, internal_b, + internal_Y); + } + } + } } /// \brief Computes Y := a*X + b*Y /// -/// This function is non-blocking and thread-safe +/// This function is non-blocking and thread-safe. /// The kernel is executed in the default stream/queue /// associated with the execution space of XMV. /// -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam BV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam AV Scalar or 0-D Kokkos::View or 1-D Kokkos::View. +/// \tparam XMV 1-D Kokkos::View or 2-D Kokkos::View. It must +/// have the same rank as YMV. +/// \tparam BV Scalar or 0-D Kokkos::View or 1-D Kokkos::View. +/// \tparam YMV 1-D Kokkos::View or 2-D Kokkos::View. /// -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param b [in] view of type BV, scaling parameter for Y. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param b [in] input of type BV: +/// - scaling parameter for 1-D or 2-D Y, +/// - scaling parameters for 2-D Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. template void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { axpby(typename XMV::execution_space{}, a, X, b, Y); @@ -142,39 +245,49 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { /// \brief Computes Y := a*X + Y /// -/// This function is non-blocking and thread-safe +/// This function is non-blocking and thread-safe. /// -/// \tparam execution_space a Kokkos execution space where the kernel will run. -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam execution_space The type of execution space where the kernel +/// will run. +/// \tparam AV Scalar or 0-D or 1-D Kokkos::View. +/// \tparam XMV 1-D or 2-D Kokkos::View. It must have the +/// the same rank as YMV. +/// \tparam YMV 1-D or 2-D Kokkos::View. /// -/// \param space [in] the execution space instance on which the kernel will run. -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param pExecSpace [in] The execution space instance on which the kernel +/// will run. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. template -void axpy(const execution_space& space, const AV& a, const XMV& X, +void axpy(const execution_space& pExecSpace, const AV& a, const XMV& X, const YMV& Y) { - axpby(space, a, X, + axpby(pExecSpace, a, X, Kokkos::ArithTraits::one(), Y); } /// \brief Computes Y := a*X + Y /// -/// This function is non-blocking and thread-safe +/// This function is non-blocking and thread-safe. /// The kernel is executed in the default stream/queue /// associated with the execution space of XMV. /// -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam AV Scalar or 0-D Kokkos::View or 1-D Kokkos::View. +/// \tparam XMV 1-D Kokkos::View or 2-D Kokkos::View. It must +/// have the same rank as YMV. +/// \tparam YMV 1-D Kokkos::View or 2-D Kokkos::View. /// -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. template void axpy(const AV& a, const XMV& X, const YMV& Y) { axpy(typename XMV::execution_space{}, a, X, Y); diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index 1abd288b0f..b370436391 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -24,6 +24,7 @@ #include "Test_Blas1_asum.hpp" #include "Test_Blas1_axpby.hpp" #include "Test_Blas1_axpy.hpp" +#include "Test_Blas1_axpby_unification.hpp" #include "Test_Blas1_dot.hpp" #include "Test_Blas1_iamax.hpp" #include "Test_Blas1_mult.hpp" diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 8d5afb5f0b..299e18e493 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -109,8 +109,6 @@ void impl_test_axpby_mv(int N, int K) { Kokkos::deep_copy(org_y.h_base, y.d_base); Kokkos::deep_copy(x.h_base, x.d_base); - Kokkos::View r("Dot::Result", K); - KokkosBlas::axpby(a, x.d_view, b, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp new file mode 100644 index 0000000000..e468b513b0 --- /dev/null +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -0,0 +1,2204 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operation y += a * x + b * y: +// 01) Type of 'x' and 'a' components: float, double, complex, ... +// 02) Type of 'y' and 'b' components: float, double, complex, ... +// 03) Execution space: serial, threads, OpenMP, Cuda, ... +// 04) Layout of 'x' and 'a' +// 05) Layout of 'y' and 'b' +// 06) Ranks of 'x' and 'y': rank-1 or rank-2 +// 07) Ranks of 'a' and 'b': scalars or rank-0 or rank-1 +// +// Choices (01)-(03) are selected in the routines TEST_F() at the very +// bottom of the file, when calling: +// - either test_axpby_unificationr<...>(), +// - or test_axpby_mv_unificationr<...>(). +// +// Choices (04)-(05) are selected in routines: +// - test_axpby_unification<...>(), when calling +// Test::impl_test_axpby_unification<...>(), and +// - test_axpby_mv_unification<...>(), when calling +// Test::impl_test_axpby_mv_unification<...>(). +// +// Choices (06)-(07) are selected in routines: +// - Test::impl_test_axpby_unification<...>(), through +// 16 different combinations and calls to +// Test::impl_test_axpby_unification_compare<...>(), and +// - Test::impl_test_axpby_mv_unification<...>(), through +// 36 different combinations and calls to +// Test::impl_test_axpby_mv_unification_compare<...>(). +// +// The constexpr integer value 15 for 'numVecsAxpbyTest' was chosen to +// force the test of the three unrolling values 8, 4, and 1, in routine +// Axpby_MV_Invoke_Left<...>(...) in file KokkosBlas1_axpby_mv_impl.hpp +// ********************************************************************** + +#include +#include +#include +#include + +static constexpr int numVecsAxpbyTest = 15; + +namespace Test { + +template +struct getScalarTypeFromT { + using type = T; +}; + +template +struct getScalarTypeFromT { + using type = typename T::value_type; +}; + +template +constexpr bool isRank0() { + if constexpr (Kokkos::is_view_v) { + return (T::rank == 0); + } + return false; +} + +template +void impl_test_axpby_unification_compare( + tA const& a, tX const& x, tB const& b, tY const& y, int N, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = 0, tScalarB const inputValueB = 0) { + using ScalarTypeX = + typename std::remove_const::type; + using ScalarTypeY = + typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarTypeX randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + } + Kokkos::deep_copy(x.h_view, x.d_view); + + { + ScalarTypeY randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + } + tY org_y("Org_Y", N); + Kokkos::deep_copy(org_y.h_view, y.d_view); + + tScalarA valueA(0); + tScalarB valueB(0); + + if constexpr (std::is_same_v) { + valueA = a; + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + Kokkos::deep_copy(b.h_view, b.d_view); + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueA = inputValueA; + } else { + typename tA::HostMirror h_a("h_A"); + Kokkos::deep_copy(h_a, a); + valueA = h_a(); + } + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + Kokkos::deep_copy(b.h_view, b.d_view); + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else { + Kokkos::deep_copy(a.h_view, a.d_view); + valueA = a.h_view(0); + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else { + Kokkos::deep_copy(b.h_view, b.d_view); + valueB = b.h_view(0); + KokkosBlas::axpby(a.d_view, x.d_view, b.d_view, y.d_view); + } + } + + Kokkos::deep_copy(y.h_view, y.d_view); + + for (int i(0); i < N; ++i) { + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + + valueB * org_y.h_view(i)), + y.h_view(i), 2. * max_error); + } +} + +template +void impl_test_axpby_mv_unification_compare( + tA const& a, tX const& x, tB const& b, tY const& y, int N, int K, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = 0, tScalarB const inputValueB = 0) { + using ScalarTypeX = + typename std::remove_const::type; + using ScalarTypeY = + typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarTypeX randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + } + Kokkos::deep_copy(x.h_view, x.d_view); + + { + ScalarTypeY randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + } + tY org_y("Org_Y", N, K); + Kokkos::deep_copy(org_y.h_view, y.d_view); + + // Cannot use "if constexpr (isRank1()) {" because rank-1 variables + // are passed to current routine with view_stride_adapter<...> + bool constexpr aIsRank1 = !std::is_same_v && !isRank0(); + if constexpr (aIsRank1) { + Kokkos::deep_copy(a.h_view, a.d_view); + } + + // Cannot use "if constexpr (isRank1()) {" because rank-1 variables + // are passed to current routine with view_stride_adapter<...> + bool constexpr bIsRank1 = !std::is_same_v && !isRank0(); + if constexpr (bIsRank1) { + Kokkos::deep_copy(b.h_view, b.d_view); + } + + tScalarA valueA(0); + tScalarB valueB(0); + if constexpr (std::is_same_v) { + valueA = a; + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueA = inputValueA; + } else { + typename tA::HostMirror h_a("h_A"); + Kokkos::deep_copy(h_a, a); + valueA = h_a(); + } + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else { + valueA = a.h_view(0); + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else { + valueB = b.h_view(0); + KokkosBlas::axpby(a.d_view, x.d_view, b.d_view, y.d_view); + } + } + + Kokkos::deep_copy(y.h_view, y.d_view); + + for (int i(0); i < N; ++i) { + for (int k(0); k < K; ++k) { + ScalarTypeY vanillaValue(0.); + if constexpr (aIsRank1) { + (void)valueA; // Avoid "set but not used" error + if constexpr (bIsRank1) { + (void)valueB; // Avoid "set but not used" error + int a_k(a.h_view.extent(0) == 1 ? 0 : k); + int b_k(b.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k) + + b.h_view(b_k) * org_y.h_view(i, k)); + } else { + int a_k(a.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = static_cast( + a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + } + } else { + if constexpr (bIsRank1) { + (void)valueB; // Avoid "set but not used" error + int b_k(b.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = static_cast( + valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + } else { + vanillaValue = static_cast(valueA * x.h_view(i, k) + + valueB * org_y.h_view(i, k)); + } + } + + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); + } + } +} + +template +void impl_test_axpby_unification(int const N) { + using ViewTypeAr0 = Kokkos::View; + using ViewTypeAr1s_1 = Kokkos::View; + using ViewTypeAr1d = Kokkos::View; + + using ViewTypeX = Kokkos::View; + + using ViewTypeBr0 = Kokkos::View; + using ViewTypeBr1s_1 = Kokkos::View; + using ViewTypeBr1d = Kokkos::View; + + using ViewTypeY = Kokkos::View; + + std::array const valuesA{-1, 0, 1, 3}; + std::array const valuesB{-1, 0, 1, 5}; + + // eps should probably be based on tScalarB since that is the type + // in which the result is computed. + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; + MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); + MagnitudeB const max_val = 10; + MagnitudeB const max_error = + static_cast( + Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + max_val * eps; + + // ************************************************************ + // Case 01/16: Ascalar + Bscalar + // ************************************************************ + // std::cout << "Starting case 01/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + a = valueA; + b = valueB; + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 02/16: Ascalar + Br0 + // ************************************************************ + // std::cout << "Starting case 02/16" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + // ViewTypeBr0 b; + // Kokkos::deep_copy(b, valueB); + // //std::cout << "b() = " << b() << std::endl; + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N); + + a = valueA; + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 03/16: Ascalar + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 03/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 04/16: Ascalar + Br1d + // ************************************************************ + // std::cout << "Starting case 04/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 05/16: Ar0 + Bscalar + // ************************************************************ + // std::cout << "Starting case 05/16" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + b = valueB; + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 06/16: Ar0 + Br0 + // ************************************************************ + // std::cout << "Starting case 06/16" << std::endl; + if constexpr ((std::is_same_v) || + (std::is_same_v)) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 07/16: Ar0 + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 07/16" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 08/16: Ar0 + Br1d + // ************************************************************ + // std::cout << "Starting case 08/16" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 09/16: Ar1s_1 + Bscalar + // ************************************************************ + // std::cout << "Starting case 09/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, max_val, + max_error); + } + } + } + + // ************************************************************ + // Case 10/16: Ar1s_1 + Br0 + // ************************************************************ + // std::cout << "Starting case 10/16" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 11/16: Ar1s_1 + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 11/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 12/16: Ar1s_1 + Br1d + // ************************************************************ + // std::cout << "Starting case 12/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 13/16: Ar1d + Bscalar + // ************************************************************ + // std::cout << "Starting case 13/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, max_val, + max_error); + } + } + } + + // ************************************************************ + // Case 14/16: Ar1d + Br0 + // ************************************************************ + // std::cout << "Starting case 14/16" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 15/16: Ar1d + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 15/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 16/16: Ar1d + Br1d + // ************************************************************ + // std::cout << "Starting case 16/16" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, max_val, max_error); + } + } + } +} + +template +void impl_test_axpby_mv_unification(int const N, int const K) { + // std::cout << "=========================================" << std::endl; + // std::cout << "Entering impl_test_axpby_mv_unification()" + // << ": tLayoutA = " << typeid(tLayoutA).name() + // << ": tLayoutX = " << typeid(tLayoutX).name() + // << ", tLayoutB = " << typeid(tLayoutB).name() + // << ": tLayoutY = " << typeid(tLayoutY).name() + // << std::endl; + using ViewTypeAr0 = Kokkos::View; + using ViewTypeAr1s_1 = Kokkos::View; + using ViewTypeAr1s_k = Kokkos::View; // Yes, hard coded + using ViewTypeAr1d = Kokkos::View; + + using ViewTypeX = Kokkos::View; + + using ViewTypeBr0 = Kokkos::View; + using ViewTypeBr1s_1 = Kokkos::View; + using ViewTypeBr1s_k = Kokkos::View; // Yes, hard coded + using ViewTypeBr1d = Kokkos::View; + + using ViewTypeY = Kokkos::View; + + std::array const valuesA{-1, 0, 1, 3}; + std::array const valuesB{-1, 0, 1, 5}; + + // eps should probably be based on tScalarB since that is the type + // in which the result is computed. + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; + MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); + MagnitudeB const max_val = 10; + MagnitudeB const max_error = + static_cast( + Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + max_val * eps; + + // ************************************************************ + // Case 01/36: Ascalar + Bscalar + // ************************************************************ + // std::cout << "Starting case 01/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + a = valueA; + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 02/36: Ascalar + Br0 + // ************************************************************ + // std::cout << "Starting case 02/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + a = valueA; + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 03/36: Ascalar + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 03/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 04/36: Ascalar + Br1s_k + // ************************************************************ + // std::cout << "Starting case 04/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + tScalarA a; + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + a = valueA; + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 05/36: Ascalar + Br1d,1 + // ************************************************************ + // std::cout << "Starting case 05/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 06/36: Ascalar + Br1d,k + // ************************************************************ + // std::cout << "Starting case 06/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + a = valueA; + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 07/36: Ar0 + Bscalar + // ************************************************************w + // std::cout << "Starting case 07/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 08/36: Ar0 + Br0 + // ************************************************************ + // std::cout << "Starting case 08/36" << std::endl; + if constexpr ((std::is_same_v) || + (std::is_same_v)) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 09/36: Ar0 + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 09/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 10/36: Ar0 + Br1s_k + // ************************************************************ + // std::cout << "Starting case 10/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 11/36: Ar0 + Br1d,1 + // ************************************************************ + // std::cout << "Starting case 11/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 12/36: Ar0 + Br1d,k + // ************************************************************ + // std::cout << "Starting case 12/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 13/36: Ar1s_1 + Bscalar + // ************************************************************w + // std::cout << "Starting case 13/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + + // ************************************************************ + // Case 14/36: Ar1s_1 + Br0 + // ************************************************************ + // std::cout << "Starting case 14/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 15/36: Ar1s_1 + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 15/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 16/36: Ar1s_1 + Br1s_k + // ************************************************************ + // std::cout << "Starting case 16/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 17/36: Ar1s_1 + Br1d,1 + // ************************************************************ + // std::cout << "Starting case 17/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 18/36: Ar1s_1 + Br1d,k + // ************************************************************ + // std::cout << "Starting case 18/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 19/36: Ar1s_k + Bscalar + // ************************************************************ + // std::cout << "Starting case 19/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + + // ************************************************************ + // Case 20/36: Ar1s_k + Br0 + // ************************************************************ + // std::cout << "Starting case 20/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 21/36: Ar1s_k + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 21/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 22/36: Ar1s_k + Br1s_k + // ************************************************************ + // std::cout << "Starting case 22/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 23/36: Ar1s_k + Br1d,1 + // ************************************************************ + // std::cout << "Starting case 23/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 24/36: Ar1s_k + Br1d,k + // ************************************************************ + // std::cout << "Starting case 24/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 25/36: Ar1d,1 + Bscalar + // ************************************************************w + // std::cout << "Starting case 25/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + + // ************************************************************ + // Case 26/36: Ar1d,1 + Br0 + // ************************************************************ + // std::cout << "Starting case 26/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 27/36: Ar1d,1 + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 27/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 28/36: Ar1d,1 + Br1s_k + // ************************************************************ + // std::cout << "Starting case 28/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 29/36: Ar1d,1 + Br1d,1 + // ************************************************************ + // std::cout << "Starting case 29/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 30/36: Ar1d,1 + Br1d,k + // ************************************************************ + // std::cout << "Starting case 30/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 31/36: Ar1d,k + Bscalar + // ************************************************************w + // std::cout << "Starting case 31/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + + // ************************************************************ + // Case 32/36: Ar1d,k + Br0 + // ************************************************************ + // std::cout << "Starting case 32/36" << std::endl; + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + max_error); + } + } + } + } + + // ************************************************************ + // Case 33/36: Ar1d,k + Br1s_1 + // ************************************************************ + // std::cout << "Starting case 33/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 34/36: Ar1d,k + Br1s_k + // ************************************************************ + // std::cout << "Starting case 34/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 35/36: Ar1d,k + Br1d,1 + // ************************************************************ + // std::cout << "Starting case 35/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 36/36: Ar1d,k + Br1d,k + // ************************************************************ + // std::cout << "Starting case 36/36" << std::endl; + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_view, a.h_view); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_view, b.h_view); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, max_val, max_error); + } + } + } + + // std::cout << "Leaving impl_test_axpby_mv_unification()" << std::endl; + // std::cout << "=========================================" << std::endl; +} + +} // namespace Test + +template +int test_axpby_unification() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>(14); +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); + + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); + + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>(14); + + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>(14); +#endif + return 1; +} + +template +int test_axpby_mv_unification() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( + 14, numVecsAxpbyTest); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>( + 14, numVecsAxpbyTest); +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( + 14, numVecsAxpbyTest); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( + 14, numVecsAxpbyTest); + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( + 14, numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>( + 14, numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>( + 14, numVecsAxpbyTest); +#endif + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_float"); + test_axpby_unification(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_mv_unification_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_float"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double"); + test_axpby_unification(); +} +TEST_F(TestCategory, axpby_mv_unification_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_mv_unification_double"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_unification_complex_double"); + test_axpby_unification, Kokkos::complex, + Kokkos::complex, Kokkos::complex, + TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_mv_unification_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_mv_unification_complex_double"); + test_axpby_mv_unification, Kokkos::complex, + Kokkos::complex, Kokkos::complex, + TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_int"); + test_axpby_unification(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_mv_unification_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_int"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, axpby_unification_double_int) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_unification_double_int"); + test_axpby_unification(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_double_mv_unification_int) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_mv_unification_double_int"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/common/src/KokkosKernels_helpers.hpp b/common/src/KokkosKernels_helpers.hpp index b36360b991..1b725f2f5c 100644 --- a/common/src/KokkosKernels_helpers.hpp +++ b/common/src/KokkosKernels_helpers.hpp @@ -29,11 +29,11 @@ namespace Impl { // Used to reduce number of code instantiations. template struct GetUnifiedLayoutPreferring { - typedef typename std::conditional< - ((ViewType::rank == 1) && (!std::is_same::value)) || - ((ViewType::rank == 0)), - PreferredLayoutType, typename ViewType::array_layout>::type array_layout; + using array_layout = typename std::conditional< + ((ViewType::rank == 1) && !std::is_same_v) || + (ViewType::rank == 0), + PreferredLayoutType, typename ViewType::array_layout>::type; }; template From 9b17fcf0dcac8e626e3401295a2cefc2bbacb620 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 1 Aug 2023 09:20:43 -0600 Subject: [PATCH 017/326] Addressing feedbacks from Luc, plus some small changes here and there: In KokkosBlas1_axpby_unification_attempt.hpp: - Removed unnecessary variables, routines, and checks - Imposed terminology consistency: variable names begin with lower case letters, type names begin with upper case letters - Using static_assert as much as possible - Using 'public' and 'private' keywords accordingly - Improved some explanations and error messages In KokkosBlas1_axpby_spec.hpp: - Replace 'a' and 'b' by 'scalar_x' and 'scalar_y' where appropriate, to keep consistency with the terminology used in the 'impl' and 'mv_impl' files of the axpby operation. - Not using the 'KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY' define anymore. Code is now consistent with the 'old' value 3 for such define. In KokkosBlas1_axpby_impl.hpp and KokkosBlas1_axpby_mv_impl.hpp: - Not using the 'KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY' define anymore. Code is now consistent with the 'old' value 3 for such define. - Using 'if constexpr' whenever possible - Checking that -1 <= scalar_x <= 2 and that -1 <= scalar_y <= 2 - Replaced '} else {' by '} else if (scalar_x == 2)' or by '} else if (scalar_y == 2)', whenever possible - Improved error messages - Improved explanation headers a bit In KokkosBlas1_axpby.hpp: - Renamed some variables to more meaningful names --- blas/impl/KokkosBlas1_axpby_impl.hpp | 216 +++--- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 657 ++++++++---------- blas/impl/KokkosBlas1_axpby_spec.hpp | 148 ++-- ...Blas1_axpby_unification_attempt_traits.hpp | 610 +++++++--------- blas/src/KokkosBlas1_axpby.hpp | 42 +- blas/unit_test/Test_Blas2_syr.hpp | 4 +- 6 files changed, 730 insertions(+), 947 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index 8b70cece42..14856b3bb7 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -19,10 +19,7 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" - -#ifndef KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY -#define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY 3 -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY +#include "KokkosKernels_Error.hpp" namespace KokkosBlas { namespace Impl { @@ -56,8 +53,8 @@ axpbyVarExtent(T&) { // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. @@ -77,22 +74,26 @@ struct Axpby_Functor { const SizeType startingColumn) : m_x(x), m_y(y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YV::rank == (int)XV::rank, - "KokkosBlas::Impl::" - "Axpby_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": X and Y must have the same rank."); static_assert(YV::rank == 1, - "KokkosBlas::Impl::Axpby_Functor: " - "XV and YV must have rank 1."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": XV and YV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { if (axpbyVarExtent(m_a) > 1) { m_a = Kokkos::subview( @@ -111,82 +112,62 @@ struct Axpby_Functor { // are template parameters), so the compiler should evaluate these // branches at compile time. -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY <= 2 - - if (scalar_x == 0) { - if (scalar_y == 0) { - m_y(i) = ATS::zero(); - } else { // (scalar_y == 2) - m_y(i) = m_b(0) * m_y(i); - } - } else { // (scalar_x == 2) - if (scalar_y == 0) { - m_y(i) = m_a(0) * m_x(i); - } else { // (scalar_y == 2) - m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); - } - } - -#else // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // ************************************************************** // Possibilities with 'scalar_x == 0' // ************************************************************** - if (scalar_x == 0) { - if (scalar_y == 0) { + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { m_y(i) = ATS::zero(); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = -m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { // Nothing to do: m_y(i) = m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = m_b(0) * m_y(i); } } // ************************************************************** // Possibilities with 'scalar_x == -1' // ************************************************************** - else if (scalar_x == -1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == -1) { + if constexpr (scalar_y == 0) { m_y(i) = -m_x(i); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = -m_x(i) - m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { m_y(i) = -m_x(i) + m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = -m_x(i) + m_b(0) * m_y(i); } } // ************************************************************** // Possibilities with 'scalar_x == 1' // ************************************************************** - else if (scalar_x == 1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { m_y(i) = m_x(i); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = m_x(i) - m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { m_y(i) = m_x(i) + m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = m_x(i) + m_b(0) * m_y(i); } } // ************************************************************** // Possibilities with 'scalar_x == 2' // ************************************************************** - else { // (scalar_x == 2) - if (scalar_y == 0) { + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { m_y(i) = m_a(0) * m_x(i); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = m_a(0) * m_x(i) - m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { m_y(i) = m_a(0) * m_x(i) + m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); } } - -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY } }; @@ -201,8 +182,8 @@ struct Axpby_Functor { // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. @@ -225,22 +206,26 @@ struct Axpby_Functor::value, - "KokkosBlas::Impl::" - "Axpby_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_Functor: R is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YV::rank == (int)XV::rank, - "KokkosBlas::Impl::" - "Axpby_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": X and Y must have the same rank."); static_assert(YV::rank == 1, - "KokkosBlas::Impl::Axpby_Functor: " + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" "XV and YV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -249,82 +234,62 @@ struct Axpby_Functor(ATS::zero()); - } else { // (scalar_y == 2) - m_y(i) = static_cast(m_b * m_y(i)); - } - } else { // (scalar_x == 2) - if (scalar_y == 0) { - m_y(i) = static_cast(m_a * m_x(i)); - } else { // (scalar_y == 2) - m_y(i) = static_cast(m_a * m_x(i) + - m_b * m_y(i)); - } - } -#else // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // ************************************************************** // Possibilities with 'scalar_x == 0' // ************************************************************** - if (scalar_x == 0) { - if (scalar_y == 0) { + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { m_y(i) = ATS::zero(); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = -m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { // Nothing to do: m_y(i) = m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = m_b * m_y(i); } } // ************************************************************** // Possibilities with 'scalar_x == -1' // ************************************************************** - else if (scalar_x == -1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == -1) { + if constexpr (scalar_y == 0) { m_y(i) = -m_x(i); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = -m_x(i) - m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { m_y(i) = -m_x(i) + m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = -m_x(i) + m_b * m_y(i); } } // ************************************************************** // Possibilities with 'scalar_x == 1' // ************************************************************** - else if (scalar_x == 1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { m_y(i) = m_x(i); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = m_x(i) - m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { m_y(i) = m_x(i) + m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = m_x(i) + m_b * m_y(i); } } // ************************************************************** // Possibilities with 'scalar_x == 2' // ************************************************************** - else { // (scalar_x == 2) - if (scalar_y == 0) { + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { m_y(i) = m_a * m_x(i); - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { m_y(i) = m_a * m_x(i) - m_y(i); - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { m_y(i) = m_a * m_x(i) + m_y(i); - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { m_y(i) = m_a * m_x(i) + m_b * m_y(i); } } - -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY } }; @@ -358,6 +323,15 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, "KokkosBlas::Impl::Axpby_Generic: " "XV and YV must have rank 1."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_Generic()" + ": scalar_x and/or scalar_y are out of range."); + } + const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); @@ -369,9 +343,7 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S0", policy, op); - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (scalar_y == -1) { + } else if (scalar_y == -1) { Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S1", policy, op); @@ -379,15 +351,12 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S2", policy, op); - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S3", policy, op); } } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // **************************************************************** // Possibilities with 'scalar_x == -1' // **************************************************************** @@ -404,7 +373,7 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S6", policy, op); - } else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S7", policy, op); @@ -426,24 +395,21 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S10", policy, op); - } else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S11", policy, op); } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // **************************************************************** // Possibilities with 'scalar_x == 2' // **************************************************************** - else { // (scalar_x == 2) + else if (scalar_x == 2) { if (scalar_y == 0) { Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S12", policy, op); - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (scalar_y == -1) { + } else if (scalar_y == -1) { Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S13", policy, op); @@ -451,9 +417,7 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S14", policy, op); - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S15", policy, op); diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 52ccc8de60..20cf65582a 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -35,8 +35,8 @@ namespace Impl { // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. @@ -54,37 +54,39 @@ struct Axpby_MV_Functor { Axpby_MV_Functor(const XMV& X, const YMV& Y, const AV& av, const BV& bv) : numCols(X.extent(1)), m_x(X), m_y(Y), m_a(av), m_b(bv) { - // XMV and YMV must be Kokkos::View specializations. static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: a is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": 'a' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: b is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": 'b' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: Y is not a Kokkos::View."); - // YMV must be nonconst (else it can't be an output argument). + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": XMV and YMV must have rank 2."); static_assert(AV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "AV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": AV must have rank 1."); static_assert(BV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "BV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": BV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -96,8 +98,8 @@ struct Axpby_MV_Functor { // ************************************************************** // Possibilities with 'scalar_x == 0' // ************************************************************** - if (scalar_x == 0) { - if (scalar_y == 0) { + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -107,7 +109,7 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = ATS::zero(); } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -117,9 +119,9 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_y(i, k); } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { // Nothing to do: Y(i,j) := Y(i,j) - } else { // if (scalar_y == 2) { + } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -146,8 +148,8 @@ struct Axpby_MV_Functor { // ************************************************************** // Possibilities with 'scalar_x == -1' // ************************************************************** - else if (scalar_x == -1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == -1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -157,7 +159,7 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_x(i, k); } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -167,7 +169,7 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_x(i, k) - m_y(i, k); } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -177,7 +179,7 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_x(i, k) + m_y(i, k); } - } else { // if (scalar_y == 2) { + } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -204,8 +206,8 @@ struct Axpby_MV_Functor { // ************************************************************** // Possibilities with 'scalar_x == 1' // ************************************************************** - else if (scalar_x == 1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -215,7 +217,7 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_x(i, k); } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -225,7 +227,7 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_x(i, k) - m_y(i, k); } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -235,7 +237,7 @@ struct Axpby_MV_Functor { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_x(i, k) + m_y(i, k); } - } else { // if (scalar_y == 2) { + } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -262,8 +264,8 @@ struct Axpby_MV_Functor { // ************************************************************** // Possibilities with 'scalar_x == 2' // ************************************************************** - else { // if (scalar_x == 2) { - if (scalar_y == 0) { + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -285,7 +287,7 @@ struct Axpby_MV_Functor { m_y(i, k) = m_a(k) * m_x(i, k); } } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -307,7 +309,7 @@ struct Axpby_MV_Functor { m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); } } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -329,7 +331,7 @@ struct Axpby_MV_Functor { m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); } } - } else { // if (scalar_y == 2) { + } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -375,8 +377,8 @@ struct Axpby_MV_Functor { } } } - } // if (scalar_y == ...) else if - } // if (scalar_x == ...) else if + } // if constexpr (scalar_y == ...) else if + } // if constexpr (scalar_x == ...) else if } // void operator() }; @@ -390,8 +392,8 @@ struct Axpby_MV_Functor { // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. @@ -416,22 +418,26 @@ struct Axpby_MV_Functor::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": XMV and YMV must have rank 2."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -443,8 +449,8 @@ struct Axpby_MV_Functor::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: a is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": 'a' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: b is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": 'b' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": XMV and YMV must have rank 2."); static_assert(AV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "AV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": AV must have rank 1."); static_assert(BV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "BV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": BV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { if (axpbyVarExtent(m_a) > 1) { @@ -683,117 +693,27 @@ struct Axpby_MV_Unroll_Functor { // are template parameters), so the compiler should evaluate these // branches at compile time. -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY <= 2 - // ************************************************************** // Possibilities with 'scalar_x == 0' // ************************************************************** - if (scalar_x == 0) { - if (scalar_y == 0) { + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = ATS::zero(); } - } else { // (scalar_y == 2) { - if (m_b.extent(0) == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b(0) * m_y(i, k); - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b(k) * m_y(i, k); - } - } - } - } - // ************************************************************** - // Possibilities with 'scalar_x == 2' - // ************************************************************** - else { // (scalar_x == 2) - if (scalar_y == 0) { - if (m_a.extent(0) == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(0) * m_x(i, k); - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k); - } - } - } else { // (scalar_y == 2) - if (m_a.extent(0) == 1) { - if (m_b.extent(0) == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(0) * m_x(i, k) + m_b(k) * m_y(i, k); - } - } - } else { - if (m_b.extent(0) == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); - } - } - } - } - } - -#else // KOKKOSBLAS_OPTIMIZATION_LEVEL >= 3 - - // ************************************************************** - // Possibilities with 'scalar_x == 0' - // ************************************************************** - if (scalar_x == 0) { - if (scalar_y == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); - } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_y(i, k); } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { // Nothing to do: Y(i,j) := Y(i,j) - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -814,29 +734,29 @@ struct Axpby_MV_Unroll_Functor { // ************************************************************** // Possibilities with 'scalar_x == -1' // ************************************************************** - else if (scalar_x == -1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == -1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_x(i, k); } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_x(i, k) - m_y(i, k); } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_x(i, k) + m_y(i, k); } - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -857,29 +777,29 @@ struct Axpby_MV_Unroll_Functor { // ************************************************************** // Possibilities with 'scalar_x == 1' // ************************************************************** - else if (scalar_x == 1) { - if (scalar_y == 0) { + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_x(i, k); } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_x(i, k) - m_y(i, k); } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_x(i, k) + m_y(i, k); } - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -900,8 +820,8 @@ struct Axpby_MV_Unroll_Functor { // ************************************************************** // Possibilities with 'scalar_x == 2' // ************************************************************** - else { // (scalar_x == 2) - if (scalar_y == 0) { + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -917,7 +837,7 @@ struct Axpby_MV_Unroll_Functor { m_y(i, k) = m_a(k) * m_x(i, k); } } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -933,7 +853,7 @@ struct Axpby_MV_Unroll_Functor { m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); } } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -949,7 +869,7 @@ struct Axpby_MV_Unroll_Functor { m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); } } - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -985,7 +905,6 @@ struct Axpby_MV_Unroll_Functor { } } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY } }; @@ -1011,22 +930,26 @@ struct Axpby_MV_Unroll_Functor::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": XMV and YMV must have rank 2."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -1035,72 +958,27 @@ struct Axpby_MV_Unroll_Functor 2 - - // ************************************************************** - // Possibilities with 'scalar_x == 0' - // ************************************************************** - if (scalar_x == 0) { - if (scalar_y == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); - } - } else if (scalar_y == -1) { + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_y(i, k); } - } else if (scalar_y == 1) { + } else if constexpr (scalar_y == 1) { // Nothing to do: Y(i,j) := Y(i,j) - } else { // (scalar_y == 2) + } else if constexpr (scalar_y == 2) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -1112,29 +990,29 @@ struct Axpby_MV_Unroll_Functor::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unrolled: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unrolled: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Unrolled: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Unrolled: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Unrolled: " - "XMV and YMV must have rank 2."); - + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": XMV and YMV must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": scalar_x and/or scalar_y are out of range."); + } + const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); @@ -1270,9 +1154,7 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S0", policy, op); - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (scalar_y == -1) { + } else if (scalar_y == -1) { Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S1", policy, op); @@ -1280,15 +1162,12 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S2", policy, op); - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY - else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S3", policy, op); } } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // **************************************************************** // Possibilities with 'scalar_x == -1' // **************************************************************** @@ -1305,7 +1184,7 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S6", policy, op); - } else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S7", policy, op); @@ -1327,24 +1206,21 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S10", policy, op); - } else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S11", policy, op); } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // **************************************************************** // Possibilities with 'scalar_x == 2' // **************************************************************** - else { // (scalar_x == 2) + else if (scalar_x == 2) { if (scalar_y == 0) { Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S12", policy, op); - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (scalar_y == -1) { + } else if (scalar_y == -1) { Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S13", policy, op); @@ -1352,9 +1228,7 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S14", policy, op); - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_MV_Unroll_Functor op( x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S15", policy, op); @@ -1371,10 +1245,10 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // // scalar_x and scalar_y come in as integers. The values -1, 0, and 1 -// correspond to the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: scalar_x == 2 -// means use av, and scalar_y == 2 means use bv. Otherwise, av resp. bv are -// ignored. +// correspond to the literal values of the coefficients. The value 2 +// tells the functor to use the corresponding vector of coefficients: +// - scalar_x == 2 means use av, otherwise ignore av; +// - scalar_y == 2 means use bv, otherwise ignore bv. // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to @@ -1387,22 +1261,30 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Generic: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Generic: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Generic: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Generic: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Generic: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": XMV and YMV must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": scalar_x and/or scalar_y are out of range."); + } const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); @@ -1414,22 +1296,17 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, if (scalar_y == 0) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S16", policy, op); - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (scalar_y == -1) { + } else if (scalar_y == -1) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S17", policy, op); } else if (scalar_y == 1) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S18", policy, op); - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S19", policy, op); } } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // **************************************************************** // Possibilities with 'scalar_x == -1' // **************************************************************** @@ -1443,7 +1320,7 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, } else if (scalar_y == 1) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S22", policy, op); - } else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S23", policy, op); } @@ -1461,30 +1338,25 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, } else if (scalar_y == 1) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S26", policy, op); - } else { // (scalar_y == 2) + } else if (scalar_y == 2) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S27", policy, op); } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // **************************************************************** // Possibilities with 'scalar_x == 2' // **************************************************************** - else { // (scalar_x == 2) + else if (scalar_x == 2) { if (scalar_y == 0) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S28", policy, op); - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (scalar_y == -1) { + } else if (scalar_y == -1) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S29", policy, op); } else if (scalar_y == 1) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S30", policy, op); - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else { // (scalar_x == 2) + } else if (scalar_y == 2) { Axpby_MV_Functor op(x, y, av, bv); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S31", policy, op); } @@ -1500,10 +1372,10 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // // scalar_x and scalar_y come in as integers. The values -1, 0, and 1 -// correspond to the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: scalar_x == 2 -// means use av, and scalar_y == 2 means use bv. Otherwise, av resp. bv are -// ignored. +// correspond to the literal values of the coefficients. The value 2 +// tells the functor to use the corresponding vector of coefficients: +// - scalar_x == 2 means use av, otherwise ignore av; +// - scalar_y == 2 means use bv, otherwise ignore bv. // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to @@ -1517,22 +1389,30 @@ struct Axpby_MV_Invoke_Left { const BV& bv, const YMV& y, int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Left: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Left: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Invoke_Left: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Left: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Invoke_Left: " - "X and Y must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": X and Y must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": scalar_x and/or scalar_y are out of range."); + } const SizeType numCols = x.extent(1); @@ -1584,10 +1464,10 @@ struct Axpby_MV_Invoke_Left { // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // // scalar_x and scalar_y come in as integers. The values -1, 0, and 1 -// correspond to the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: scalar_x == 2 -// means use av, and scalar_y == 2 means use bv. Otherwise, av resp. bv are -// ignored. +// correspond to the literal values of the coefficients. The value 2 +// tells the functor to use the corresponding vector of coefficients: +// - scalar_x == 2 means use av, otherwise ignore av; +// - scalar_y == 2 means use bv, otherwise ignore bv. // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to @@ -1601,23 +1481,32 @@ struct Axpby_MV_Invoke_Right { const BV& bv, const YMV& y, int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Right: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Right: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Invoke_Right: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Right: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Invoke_Right: " - "X and Y must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": X and Y must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && + (-1 <= scalar_y) && (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": scalar_x and/or scalar_y are out of range."); + } + const SizeType numCols = x.extent(1); if (numCols == 1) { auto x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index 1de54e07ca..d36c65f135 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -219,14 +219,42 @@ struct Axpby) { + if constexpr (AV::rank == 1) { + if (av.extent(0) == 0) { + scalar_x = 0; + } + } + } + else { + using ATA = Kokkos::ArithTraits; + if (av == ATA::zero()) { + scalar_x = 0; + } else if (av == -ATA::one()) { + scalar_x = -1; + } else if (av == ATA::one()) { + scalar_x = 1; + } } - int b(2); - if (bv.extent(0) == 0) { - b = 0; + int scalar_y(2); + if constexpr (Kokkos::is_view_v) { + if constexpr (BV::rank == 1) { + if (bv.extent(0) == 0) { + scalar_y = 0; + } + } + } + else { + using ATB = Kokkos::ArithTraits; + if (bv == ATB::zero()) { + scalar_y = 0; + } else if (bv == -ATB::one()) { + scalar_y = -1; + } else if (bv == ATB::one()) { + scalar_y = 1; + } } if (numRows < static_cast(INT_MAX) && @@ -237,7 +265,7 @@ struct Axpby, Axpby_MV_Invoke_Right >::type; - Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } else { using index_type = typename XMV::size_type; using Axpby_MV_Invoke_Layout = typename std::conditional< @@ -245,7 +273,7 @@ struct Axpby, Axpby_MV_Invoke_Right >::type; - Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } @@ -309,29 +337,23 @@ struct Axpby 2 - else if (alpha == -ATA::one()) { - a = -1; + scalar_x = 0; + } else if (alpha == -ATA::one()) { + scalar_x = -1; } else if (alpha == ATA::one()) { - a = 1; + scalar_x = 1; } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - int b(2); + int scalar_y(2); if (beta == ATB::zero()) { - b = 0; - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (beta == -ATB::one()) { - b = -1; + scalar_y = 0; + } else if (beta == -ATB::one()) { + scalar_y = -1; } else if (beta == ATB::one()) { - b = 1; + scalar_y = 1; } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { @@ -341,7 +363,7 @@ struct Axpby, Axpby_MV_Invoke_Right >::type; - Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } else { using index_type = typename XMV::size_type; using Axpby_MV_Invoke_Layout = typename std::conditional< @@ -349,7 +371,7 @@ struct Axpby, Axpby_MV_Invoke_Right >::type; - Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } @@ -375,24 +397,52 @@ struct Axpby) { + if constexpr (AV::rank == 1) { + if (av.extent(0) == 0) { + scalar_x = 0; + } + } + } + else { + using ATA = Kokkos::ArithTraits; + if (av == ATA::zero()) { + scalar_x = 0; + } else if (av == -ATA::one()) { + scalar_x = -1; + } else if (av == ATA::one()) { + scalar_x = 1; + } } - int b(2); - if (bv.extent(0) == 0) { - b = 0; + int scalar_y(2); + if constexpr (Kokkos::is_view_v) { + if constexpr (BV::rank == 1) { + if (bv.extent(0) == 0) { + scalar_y = 0; + } + } + } + else { + using ATB = Kokkos::ArithTraits; + if (bv == ATB::zero()) { + scalar_y = 0; + } else if (bv == -ATB::one()) { + scalar_y = -1; + } else if (bv == ATB::one()) { + scalar_y = 1; + } } if (numRows < static_cast(INT_MAX)) { using index_type = int; Axpby_Generic( - space, av, X, bv, Y, 0, a, b); + space, av, X, bv, Y, 0, scalar_x, scalar_y); } else { using index_type = typename XV::size_type; Axpby_Generic( - space, av, X, bv, Y, 0, a, b); + space, av, X, bv, Y, 0, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); @@ -456,40 +506,34 @@ struct Axpby 2 - else if (alpha == -ATA::one()) { - a = -1; + scalar_x = 0; + } else if (alpha == -ATA::one()) { + scalar_x = -1; } else if (alpha == ATA::one()) { - a = 1; + scalar_x = 1; } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - int b(2); + int scalar_y(2); if (beta == ATB::zero()) { - b = 0; - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (beta == -ATB::one()) { - b = -1; + scalar_y = 0; + } else if (beta == -ATB::one()) { + scalar_y = -1; } else if (beta == ATB::one()) { - b = 1; + scalar_y = 1; } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (numRows < static_cast(INT_MAX)) { using index_type = int; Axpby_Generic( - space, alpha, X, beta, Y, 0, a, b); + space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } else { using index_type = typename XV::size_type; Axpby_Generic( - space, alpha, X, beta, Y, 0, a, b); + space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 46a29400fd..91cd591458 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -97,132 +97,93 @@ struct getLayoutFromView { // -------------------------------- -template -constexpr bool isTypeComplex() { - return (std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v>); -} - -// -------------------------------- - template struct AxpbyUnificationAttemptTraits { - static constexpr bool atDevCase = + // ******************************************************************** + // Terminology: + // - variable names begin with lower case letters + // - type names begin with upper case letters + // ******************************************************************** +private: + static constexpr bool onDevice = KokkosKernels::Impl::kk_is_gpu_exec_space(); - static constexpr bool atHostCase = !atDevCase; + static constexpr bool onHost = !onDevice; - static constexpr bool Asc = !Kokkos::is_view_v; - static constexpr bool Ar0 = Tr0_val(); - static constexpr bool Ar1s = Tr1s_val(); - static constexpr bool Ar1d = Tr1d_val(); - static constexpr bool Avi = Ar0 || Ar1s || Ar1d; + static constexpr bool a_is_scalar = !Kokkos::is_view_v; + static constexpr bool a_is_r0 = Tr0_val(); + static constexpr bool a_is_r1s = Tr1s_val(); + static constexpr bool a_is_r1d = Tr1d_val(); - static constexpr bool Xr1 = Kokkos::is_view_v && (XMV::rank == 1); - static constexpr bool Xr2 = Kokkos::is_view_v && (XMV::rank == 2); + static constexpr bool x_is_r1 = Kokkos::is_view_v && (XMV::rank == 1); + static constexpr bool x_is_r2 = Kokkos::is_view_v && (XMV::rank == 2); - static constexpr bool Bsc = !Kokkos::is_view_v; - static constexpr bool Br0 = Tr0_val(); - static constexpr bool Br1s = Tr1s_val(); - static constexpr bool Br1d = Tr1d_val(); - static constexpr bool Bvi = Br0 || Br1s || Br1d; + static constexpr bool b_is_scalar = !Kokkos::is_view_v; + static constexpr bool b_is_r0 = Tr0_val(); + static constexpr bool b_is_r1s = Tr1s_val(); + static constexpr bool b_is_r1d = Tr1d_val(); - static constexpr bool Yr1 = Kokkos::is_view_v && (YMV::rank == 1); - static constexpr bool Yr2 = Kokkos::is_view_v && (YMV::rank == 2); + static constexpr bool y_is_r1 = Kokkos::is_view_v && (YMV::rank == 1); + static constexpr bool y_is_r2 = Kokkos::is_view_v && (YMV::rank == 2); - static constexpr bool xyRank1Case = Xr1 && Yr1; - static constexpr bool xyRank2Case = Xr2 && Yr2; + static constexpr bool xyRank1Case = x_is_r1 && y_is_r1; + static constexpr bool xyRank2Case = x_is_r2 && y_is_r2; // ******************************************************************** - // In order to better understand the lines between now and right before - // the constructor, assume that all constructor checks. + // Declare 'AtInputScalarTypeA_nonConst' // ******************************************************************** + using ScalarTypeA2_onDevice = + typename getScalarTypeFromView::type; + using ScalarTypeA1_onDevice = + std::conditional_t; - // ******************************************************************** - // Declare 'AtInputScalarTypeA' - // ******************************************************************** - using ScalarTypeA2_atDev = - typename getScalarTypeFromView::type; - using ScalarTypeA1_atDev = - std::conditional_t; - - using ScalarTypeA2_atHost = - typename getScalarTypeFromView::type; - using ScalarTypeA1_atHost = - std::conditional_t; + using ScalarTypeA2_onHost = + typename getScalarTypeFromView::type; + using ScalarTypeA1_onHost = + std::conditional_t; using AtInputScalarTypeA = - std::conditional_t; + std::conditional_t; - using AtInputScalarTypeA_nonConst = typename std::conditional_t< - std::is_const_v, - typename std::remove_const::type, AtInputScalarTypeA>; - - static constexpr bool atInputScalarTypeA_isComplex = - isTypeComplex(); + using AtInputScalarTypeA_nonConst = + typename std::remove_const::type; // ******************************************************************** - // Declare 'AtInputScalarTypeX' + // Declare 'AtInputScalarTypeX_nonConst' // ******************************************************************** - using AtInputScalarTypeX = - typename XMV::value_type; // 'const' not removed if present - - using AtInputScalarTypeX_nonConst = typename std::conditional_t< - std::is_const_v, - typename std::remove_const::type, AtInputScalarTypeX>; + using AtInputScalarTypeX = typename XMV::value_type; - static constexpr bool atInputScalarTypeX_isComplex = - isTypeComplex(); + using AtInputScalarTypeX_nonConst = + typename std::remove_const::type; // ******************************************************************** - // Declare 'AtInputScalarTypeB' + // Declare 'AtInputScalarTypeB_nonConst' // ******************************************************************** - using ScalarTypeB2_atDev = - typename getScalarTypeFromView::type; - using ScalarTypeB1_atDev = - std::conditional_t; + using ScalarTypeB2_onDevice = + typename getScalarTypeFromView::type; + using ScalarTypeB1_onDevice = + std::conditional_t; - using ScalarTypeB2_atHost = - typename getScalarTypeFromView::type; - using ScalarTypeB1_atHost = - std::conditional_t; + using ScalarTypeB2_onHost = + typename getScalarTypeFromView::type; + using ScalarTypeB1_onHost = + std::conditional_t; using AtInputScalarTypeB = - std::conditional_t; - - using AtInputScalarTypeB_nonConst = typename std::conditional_t< - std::is_const_v, - typename std::remove_const::type, AtInputScalarTypeB>; + std::conditional_t; - static constexpr bool atInputScalarTypeB_isComplex = - isTypeComplex(); + using AtInputScalarTypeB_nonConst = + typename std::remove_const::type; // ******************************************************************** - // Declare 'AtInputScalarTypeY' + // Declare 'AtInputScalarTypeY_nonConst' // ******************************************************************** - using AtInputScalarTypeY = - typename YMV::value_type; // 'const' not removed if present + using AtInputScalarTypeY = typename YMV::value_type; - using AtInputScalarTypeY_nonConst = typename std::conditional_t< - std::is_const_v, - typename std::remove_const::type, AtInputScalarTypeY>; - - static constexpr bool atInputScalarTypeY_isComplex = - isTypeComplex(); + using AtInputScalarTypeY_nonConst = + typename std::remove_const::type; // ******************************************************************** - // Declare internal layouts + // Declare 'InternalLayoutX' and 'InternalLayoutY' // ******************************************************************** using InternalLayoutX = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; @@ -233,44 +194,48 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'InternalTypeA_tmp' // ******************************************************************** - using AtInputLayoutA = typename getLayoutFromView::type; + using AtInputLayoutA = typename getLayoutFromView::type; +public: static constexpr bool atInputLayoutA_isStride = std::is_same_v; +private: using InternalLayoutA = - std::conditional_t<(Ar1d || Ar1s) && atInputLayoutA_isStride, + std::conditional_t<(a_is_r1d || a_is_r1s) && atInputLayoutA_isStride, AtInputLayoutA, InternalLayoutX>; static constexpr bool atInputScalarTypeA_mustRemain = - atInputScalarTypeA_isComplex && !atInputScalarTypeX_isComplex; + Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; using InternalScalarTypeA = std::conditional_t< - atInputScalarTypeA_mustRemain || ((Ar1d || Ar1s) && xyRank2Case), + atInputScalarTypeA_mustRemain || ((a_is_r1d || a_is_r1s) && xyRank2Case), AtInputScalarTypeA_nonConst // Yes, keep the input scalar type , AtInputScalarTypeX_nonConst // Yes, instead of // 'AtInputScalarTypeA_nonConst' >; - using InternalTypeA_atDev = + using InternalTypeA_onDevice = Kokkos::View>; - using InternalTypeA_atHost = std::conditional_t< - (Ar1d || Ar1s) && xyRank2Case && atHostCase, + using InternalTypeA_onHost = std::conditional_t< + (a_is_r1d || a_is_r1s) && xyRank2Case && onHost, Kokkos::View>, InternalScalarTypeA>; using InternalTypeA_tmp = - std::conditional_t; + std::conditional_t; // ******************************************************************** // Declare 'InternalTypeX' // ******************************************************************** +public: using InternalTypeX = std::conditional_t< - Xr2, + x_is_r2, Kokkos::View>, @@ -281,44 +246,49 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'InternalTypeB_tmp' // ******************************************************************** - using AtInputLayoutB = typename getLayoutFromView::type; +private: + using AtInputLayoutB = typename getLayoutFromView::type; +public: static constexpr bool atInputLayoutB_isStride = std::is_same_v; +private: using InternalLayoutB = - std::conditional_t<(Br1d || Br1s) && atInputLayoutB_isStride, + std::conditional_t<(b_is_r1d || b_is_r1s) && atInputLayoutB_isStride, AtInputLayoutB, InternalLayoutY>; static constexpr bool atInputScalarTypeB_mustRemain = - atInputScalarTypeB_isComplex && !atInputScalarTypeY_isComplex; + Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; using InternalScalarTypeB = std::conditional_t< - atInputScalarTypeB_mustRemain || ((Br1d || Br1s) && xyRank2Case), + atInputScalarTypeB_mustRemain || ((b_is_r1d || b_is_r1s) && xyRank2Case), AtInputScalarTypeB_nonConst // Yes, keep the input scalar type , AtInputScalarTypeY_nonConst // Yes, instead of // 'AtInputScalarTypeB_nonConst' >; - using InternalTypeB_atDev = + using InternalTypeB_onDevice = Kokkos::View>; - using InternalTypeB_atHost = std::conditional_t< - ((Br1d || Br1s) && xyRank2Case && atHostCase), + using InternalTypeB_onHost = std::conditional_t< + ((b_is_r1d || b_is_r1s) && xyRank2Case && onHost), Kokkos::View>, InternalScalarTypeB>; using InternalTypeB_tmp = - std::conditional_t; + std::conditional_t; // ******************************************************************** // Declare 'InternalTypeY' // ******************************************************************** +public: using InternalTypeY = std::conditional_t< - Yr2, + y_is_r2, Kokkos::View>, @@ -342,7 +312,9 @@ struct AxpbyUnificationAttemptTraits { // Declare 'InternalTypeA_managed' with the same scalar type in // 'InternalTypeA' // ******************************************************************** +private: using InternalLayoutA_managed = InternalLayoutA; +public: using InternalTypeA_managed = std::conditional_t< Kokkos::is_view_v, Kokkos::View, Kokkos::View; - static constexpr bool internalTypeA_r1d = Tr1d_val(); +private: + static constexpr bool internalTypeA_is_scalar = !Kokkos::is_view_v; + static constexpr bool internalTypeA_is_r1d = Tr1d_val(); - static constexpr bool internalTypeB_sc = !Kokkos::is_view_v; - static constexpr bool internalTypeB_r1d = Tr1d_val(); + static constexpr bool internalTypeB_is_scalar = !Kokkos::is_view_v; + static constexpr bool internalTypeB_is_r1d = Tr1d_val(); +public: static constexpr bool internalTypesAB_bothScalars = - (internalTypeA_sc && internalTypeB_sc); + (internalTypeA_is_scalar && internalTypeB_is_scalar); static constexpr bool internalTypesAB_bothViews = - (internalTypeA_r1d && internalTypeB_r1d); + (internalTypeA_is_r1d && internalTypeB_is_r1d); + // ******************************************************************** + // Routine to perform checks (both compile time and run time) + // ******************************************************************** static void performChecks(const AV& a, const XMV& X, const BV& b, const YMV& Y) { // ****************************************************************** // Check 1/6: General checks // ****************************************************************** static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": tExecSpace must be a valid Kokkos execution space."); - if constexpr ((xyRank1Case && !xyRank2Case) || - (!xyRank1Case && xyRank2Case)) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 1/6" - << ", invalid general case" - << ": xyRank1Case = " << xyRank1Case - << ", xyRank2Case = " << xyRank2Case; - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + static_assert((xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": one must have either both X and Y as rank 1, or both X and Y as rank 2"); - if constexpr (atInputScalarTypeY_isComplex == false) { - if constexpr ((atInputScalarTypeA_isComplex == false) && - (atInputScalarTypeX_isComplex == false) && - (atInputScalarTypeB_isComplex == false)) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 1/6" - << ", invalid combination on scalar types: if Y is not complex, " - "then A, X and B cannot be complex" - << ": AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() - << ", AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() - << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() - << ", AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name(); - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + if constexpr (Kokkos::ArithTraits::is_complex == false) { + static_assert((Kokkos::ArithTraits::is_complex == false) && + (Kokkos::ArithTraits::is_complex == false) && + (Kokkos::ArithTraits::is_complex == false), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": if Y is not complex, then A, X and B cannot be complex"); } // ****************************************************************** // Check 2/6: YMV is valid // ****************************************************************** static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": Y is const. It must be nonconst, " "because it is an output argument " "(we must be able to write to its entries)."); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": XMV must be accessible from tExecSpace"); - if constexpr ((Yr1 && !Yr2) || (!Yr1 && Yr2)) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 2/6" - << ", invalid YMV" - << ": Yr1 = " << Yr1 << ", Yr2 = " << Yr2; - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } - // ****************************************************************** // Check 3/6: XMV is valid // ****************************************************************** static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": X is not a Kokkos::View."); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits()" + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": XMV must be accessible from tExecSpace"); - if constexpr ((Xr1 && !Xr2) || (!Xr1 && Xr2)) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 3/6" - << ", invalid XMV" - << ": Xr1 = " << Xr1 << ", Xr2 = " << Xr2; - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } - if constexpr (xyRank1Case) { if (X.extent(0) == Y.extent(0)) { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 3/6" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" << ", invalid rank-1 X extent" - << ": X.extent(0) = " << X.extent(0); + << ": X.extent(0) = " << X.extent(0) + << ", Y.extent(0) = " << Y.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { @@ -490,7 +432,7 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 3/6" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" << ", invalid rank-2 X extents" << ": X.extent(0) = " << X.extent(0) << ", X.extent(1) = " << X.extent(1) @@ -503,38 +445,22 @@ struct AxpbyUnificationAttemptTraits { // ****************************************************************** // Check 4/6: AV is valid // ****************************************************************** - if constexpr ((Asc && !Ar0 && !Ar1s && !Ar1d) || - (!Asc && Ar0 && !Ar1s && !Ar1d) || - (!Asc && !Ar0 && Ar1s && !Ar1d) || - (!Asc && !Ar0 && !Ar1s && Ar1d)) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" - << ", invalid AV = " << typeid(AV).name() << ": Asc = " << Asc - << ", Ar0 = " << Ar0 << ", Ar1s = " << Ar1s << ", Ar1d = " << Ar1d; - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } - - if constexpr (Asc || Avi) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" - << ", AV memory must be either scalar or view" - << ": Asc = " << Asc << ", Avi = " << Avi; - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } - - if constexpr (Ar1d || Ar1s) { + static_assert(( a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": 'a' must be either scalar or rank 0 or rank 1 static or rank 1 dynamic"); + + if constexpr (a_is_r1d || a_is_r1s) { if constexpr (xyRank1Case) { if (a.extent(0) == 1) { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" - << ", view 'a' must have extent(0) == 1 for xyRank1Case" - << ": a.extent(0) = " << a.extent(0); + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + << ": view 'a' must have extent(0) == 1 for xyRank1Case" + << ", a.extent(0) = " << a.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { @@ -543,53 +469,36 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 4/6" - << ", view 'a' must have extent(0) == 1 or Y.extent(1) for " + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + << ": view 'a' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" - << ": a.extent(0) = " << a.extent(0) + << ", a.extent(0) = " << a.extent(0) << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } // if (rank1Case) else - } // if Ar1d + } // if a_is_r1d // ****************************************************************** // Check 5/6: BV is valid // ****************************************************************** - if constexpr ((Bsc && !Br0 && !Br1s && !Br1d) || - (!Bsc && Br0 && !Br1s && !Br1d) || - (!Bsc && !Br0 && Br1s && !Br1d) || - (!Bsc && !Br0 && !Br1s && Br1d)) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" - << ", invalid BV" - << ": Bsc = " << Bsc << ", Br0 = " << Br0 << ", Br1s = " << Br1s - << ", Br1d = " << Br1d; - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } - - if constexpr (Bsc || Bvi) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" - << ", BV memory must be either scalar or view" - << ": Bsc = " << Bsc << ", Bvi = " << Bvi; - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } - - if constexpr (Br1d || Br1s) { + static_assert(( b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": 'b' must be either scalar or rank 0 or rank 1 static or rank 1 dynamic"); + + if constexpr (b_is_r1d || b_is_r1s) { if constexpr (xyRank1Case) { if (b.extent(0) == 1) { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" - << ", view 'b' must have extent(0) == 1 for xyRank1Case" - << ": b.extent(0) = " << b.extent(0); + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + << ": view 'b' must have extent(0) == 1 for xyRank1Case" + << ", b.extent(0) = " << b.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { @@ -597,157 +506,145 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 5/6" - << ", view 'b' must have extent(0) == 1 or Y.extent(1) for " + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + << ": view 'b' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" - << ": b.extent(0) = " << b.extent(0) + << ", b.extent(0) = " << b.extent(0) << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } // if (rank1Case) else - } // if Br1d + } // if b_is_r1d // ****************************************************************** // Check 6/6: Checks on InternalTypeA, X, B, Y // ****************************************************************** - if constexpr (atHostCase) { + if constexpr (onHost) { if constexpr (xyRank1Case) { constexpr bool internalTypeA_isOk = - (internalTypeA_sc || internalTypeA_r1d); + (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeA is wrong"); + constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeX is wrong"); + constexpr bool internalTypeB_isOk = - (internalTypeB_sc || internalTypeB_r1d); + (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeB is wrong"); + constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - if constexpr (internalTypeA_isOk && internalTypeX_isOk && - internalTypeB_isOk && internalTypeY_isOk) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " - "6.1/6" - << ", invalid internal types" - << ": atHostCase = " << atHostCase - << ", atDevCase = " << atDevCase - << ", xyRank1Case= " << xyRank1Case - << ", xyRank2Case= " << xyRank2Case - << ", InternalTypeA = " << typeid(InternalTypeA).name() - << ", InternalTypeX = " << typeid(InternalTypeX).name() - << ", InternalTypeB = " << typeid(InternalTypeB).name() - << ", InternalTypeY = " << typeid(InternalTypeY).name(); - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeY is wrong"); } else { constexpr bool internalTypeA_isOk = - (internalTypeA_sc || internalTypeA_r1d); + (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeA is wrong"); + constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeX is wrong"); + constexpr bool internalTypeB_isOk = - (internalTypeB_sc || internalTypeB_r1d); + (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeB is wrong"); + constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - if constexpr (internalTypeA_isOk && internalTypeX_isOk && - internalTypeB_isOk && internalTypeY_isOk) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " - "6.2/6" - << ", invalid internal types" - << ": atHostCase = " << atHostCase - << ", atDevCase = " << atDevCase - << ", xyRank1Case= " << xyRank1Case - << ", xyRank2Case= " << xyRank2Case - << ", InternalTypeA = " << typeid(InternalTypeA).name() - << ", InternalTypeX = " << typeid(InternalTypeX).name() - << ", InternalTypeB = " << typeid(InternalTypeB).name() - << ", InternalTypeY = " << typeid(InternalTypeY).name(); - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeY is wrong"); } } else { if constexpr (xyRank1Case) { - constexpr bool internalTypeA_isOk = internalTypeA_r1d; + constexpr bool internalTypeA_isOk = internalTypeA_is_r1d; + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeA is wrong"); + constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; - constexpr bool internalTypeB_isOk = internalTypeB_r1d; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = internalTypeB_is_r1d; + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeB is wrong"); + constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - if constexpr (internalTypeA_isOk && internalTypeX_isOk && - internalTypeB_isOk && internalTypeY_isOk) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " - "6.3/6" - << ", invalid internal types" - << ": atHostCase = " << atHostCase - << ", atDevCase = " << atDevCase - << ", xyRank1Case= " << xyRank1Case - << ", xyRank2Case= " << xyRank2Case - << ", InternalTypeA = " << typeid(InternalTypeA).name() - << ", InternalTypeX = " << typeid(InternalTypeX).name() - << ", InternalTypeB = " << typeid(InternalTypeB).name() - << ", InternalTypeY = " << typeid(InternalTypeY).name(); - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeY is wrong"); } else { - constexpr bool internalTypeA_isOk = internalTypeA_r1d; + constexpr bool internalTypeA_isOk = internalTypeA_is_r1d; + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeA is wrong"); + constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; - constexpr bool internalTypeB_isOk = internalTypeB_r1d; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = internalTypeB_is_r1d; + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeB is wrong"); + constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - if constexpr (internalTypeA_isOk && internalTypeX_isOk && - internalTypeB_isOk && internalTypeY_isOk) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check " - "6.4/6" - << ", invalid internal types" - << ": atHostCase = " << atHostCase - << ", atDevCase = " << atDevCase - << ", xyRank1Case= " << xyRank1Case - << ", xyRank2Case= " << xyRank2Case - << ", InternalTypeA = " << typeid(InternalTypeA).name() - << ", InternalTypeX = " << typeid(InternalTypeX).name() - << ", InternalTypeB = " << typeid(InternalTypeB).name() - << ", InternalTypeY = " << typeid(InternalTypeY).name(); - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeY is wrong"); } } - if constexpr (atHostCase) { + if constexpr (onHost) { // **************************************************************** - // We are in the 'atHostCase' case, with 2 possible subcases:: + // We are in the 'onHost' case, with 2 possible subcases:: // // 1) xyRank1Case, with the following possible situations: // - [InternalTypeA, B] = [S_a, S_b], or @@ -761,12 +658,12 @@ struct AxpbyUnificationAttemptTraits { // **************************************************************** static_assert( internalTypesAB_bothScalars || internalTypesAB_bothViews, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), atHostCase, " - "invalid combination of types"); - } // If atHostCase - else if constexpr (atDevCase) { + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, invalid combination of types"); + } // If onHost + else if constexpr (onDevice) { // **************************************************************** - // We are in the 'atDevCase' case, with 2 possible subcases: + // We are in the 'onDevice' case, with 2 possible subcases: // // 1) xyRank1Case, with only one possible situation: // - [InternalTypeA / B] = [view, view] @@ -778,47 +675,34 @@ struct AxpbyUnificationAttemptTraits { // **************************************************************** static_assert( internalTypesAB_bothViews, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), atDevCase, " - "invalid combination of types"); + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, invalid combination of types"); } - if constexpr (xyRank2Case && (Ar1d || Ar1s) && atInputLayoutA_isStride) { - if (std::is_same_v< - typename getLayoutFromView< - InternalTypeA, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 6.5/6" - << ", xyRank2Case = " << xyRank2Case - << ", coeff 'a' is rank-1 and has LayoutStride at input, but no " - "LayoutStride internally" - << ", AV = " << typeid(AV).name() - << ", InternalTypeA = " << typeid(InternalTypeA).name(); - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && atInputLayoutA_isStride) { + static_assert(std::is_same_v< + typename getLayoutFromView< + InternalTypeA, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } - if constexpr (xyRank2Case && (Br1d || Br1s) && atInputLayoutB_isStride) { - if (std::is_same_v< - typename getLayoutFromView< - InternalTypeB, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>) { - // Ok - } else { - std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits(), check 6.6/6" - << ", xyRank2Case = " << xyRank2Case - << ", coeff 'a' is rank-1 and has LayoutStride at input, but no " - "LayoutStride internally" - << ", BV = " << typeid(BV).name() - << ", InternalTypeB = " << typeid(InternalTypeB).name(); - KokkosKernels::Impl::throw_runtime_exception(msg.str()); - } + if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && atInputLayoutB_isStride) { + static_assert(std::is_same_v< + typename getLayoutFromView< + InternalTypeB, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } } // Constructor + // ******************************************************************** + // Routine to print information on input variables and internal variables + // ******************************************************************** static void printInformation(std::ostream& os, std::string const& headerMsg) { os << headerMsg << ": AV = " << typeid(AV).name() @@ -828,7 +712,7 @@ struct AxpbyUnificationAttemptTraits { << ", AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() << ", isConst = " << std::is_const_v << ", isComplex = " - << atInputScalarTypeA_isComplex << ", AtInputScalarTypeA_nonConst = " + << Kokkos::ArithTraits::is_complex << ", AtInputScalarTypeA_nonConst = " << typeid(AtInputScalarTypeA_nonConst).name() << ", InternalTypeA = " << typeid(InternalTypeA).name() << "\n" << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() @@ -843,7 +727,7 @@ struct AxpbyUnificationAttemptTraits { << typeid(typename XMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " << atInputScalarTypeX_isComplex << "\n" + << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" << "AtInputScalarTypeX_nonConst = " << typeid(AtInputScalarTypeX_nonConst).name() << "\n" << "InternalTypeX = " << typeid(InternalTypeX).name() << "\n" @@ -856,7 +740,7 @@ struct AxpbyUnificationAttemptTraits { << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() << ", isConst = " << std::is_const_v << ", isComplex = " - << atInputScalarTypeB_isComplex << ", AtInputScalarTypeB_nonConst = " + << Kokkos::ArithTraits::is_complex << ", AtInputScalarTypeB_nonConst = " << typeid(AtInputScalarTypeB_nonConst).name() << ", InternalTypeB = " << typeid(InternalTypeB).name() << "\n" << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() @@ -871,7 +755,7 @@ struct AxpbyUnificationAttemptTraits { << typeid(typename YMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " << atInputScalarTypeY_isComplex << "\n" + << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" << "AtInputScalarTypeY_nonConst = " << typeid(AtInputScalarTypeY_nonConst).name() << "\n" << "InternalTypeY = " << typeid(InternalTypeY).name() << "\n" diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index de1727d817..1a95f06191 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -43,7 +43,7 @@ namespace KokkosBlas { /// \tparam BV Scalar or 0-D or 1-D Kokkos::View. /// \tparam YMV 1-D or 2-D Kokkos::View. /// -/// \param pExecSpace [in] The execution space instance on which the kernel +/// \param exec_space [in] The execution space instance on which the kernel /// will run. /// \param a [in] Input of type AV: /// - scaling parameter for 1-D or 2-D X, @@ -56,7 +56,7 @@ namespace KokkosBlas { /// \param Y [in/out] View of type YMV in which the results will be /// stored. template -void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, +void axpby(const execution_space& exec_space, const AV& a, const XMV& X, const BV& b, const YMV& Y) { using AxpbyTraits = Impl::AxpbyUnificationAttemptTraits; @@ -84,7 +84,7 @@ void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, BV, Impl::typeRank()>::getValue(b)); Impl::Axpby::axpby(pExecSpace, internal_a, internal_X, + InternalTypeY>::axpby(exec_space, internal_a, internal_X, internal_b, internal_Y); } else if constexpr (AxpbyTraits::internalTypesAB_bothViews) { constexpr bool internalLayoutA_isStride( @@ -94,14 +94,14 @@ void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, std::is_same_v); - const size_t k_a(Impl::getAmountOfScalarsInCoefficient(a)); - const size_t k_b(Impl::getAmountOfScalarsInCoefficient(b)); + const size_t numScalarsA(Impl::getAmountOfScalarsInCoefficient(a)); + const size_t numScalarsB(Impl::getAmountOfScalarsInCoefficient(b)); - const size_t s_a(Impl::getStrideInCoefficient(a)); - const size_t s_b(Impl::getStrideInCoefficient(b)); + const size_t strideA(Impl::getStrideInCoefficient(a)); + const size_t strideB(Impl::getStrideInCoefficient(b)); - Kokkos::LayoutStride layoutStrideA{k_a, s_a}; - Kokkos::LayoutStride layoutStrideB{k_b, s_b}; + Kokkos::LayoutStride layoutStrideA{numScalarsA, strideA}; + Kokkos::LayoutStride layoutStrideB{numScalarsB, strideB}; InternalTypeA internal_a; InternalTypeB internal_b; @@ -136,14 +136,15 @@ void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, // Call Impl::Axpby<...>::axpby(...) // **************************************************************** Impl::Axpby::axpby(pExecSpace, internal_a, + InternalTypeB, InternalTypeY>::axpby(exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", k_b); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + numScalarsB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -155,16 +156,16 @@ void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, // Call Impl::Axpby<...>::axpby(...) // **************************************************************** Impl::Axpby::axpby(pExecSpace, internal_a, + InternalTypeB, InternalTypeY>::axpby(exec_space, internal_a, internal_X, internal_b, internal_Y); } - } else { // ****************************************************************** // Prepare internal_a // ****************************************************************** - typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", k_a); + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", + numScalarsA); if constexpr (AxpbyTraits::atInputLayoutA_isStride) { Kokkos::deep_copy(managed_a, a); } else { @@ -189,14 +190,15 @@ void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, // Call Impl::Axpby<...>::axpby(...) // **************************************************************** Impl::Axpby::axpby(pExecSpace, internal_a, + InternalTypeB, InternalTypeY>::axpby(exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", k_b); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + numScalarsB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -208,7 +210,7 @@ void axpby(const execution_space& pExecSpace, const AV& a, const XMV& X, // Call Impl::Axpby<...>::axpby(...) // **************************************************************** Impl::Axpby::axpby(pExecSpace, internal_a, + InternalTypeB, InternalTypeY>::axpby(exec_space, internal_a, internal_X, internal_b, internal_Y); } @@ -254,7 +256,7 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { /// the same rank as YMV. /// \tparam YMV 1-D or 2-D Kokkos::View. /// -/// \param pExecSpace [in] The execution space instance on which the kernel +/// \param exec_space [in] The execution space instance on which the kernel /// will run. /// \param a [in] Input of type AV: /// - scaling parameter for 1-D or 2-D X, @@ -264,9 +266,9 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { /// \param Y [in/out] View of type YMV in which the results will be /// stored. template -void axpy(const execution_space& pExecSpace, const AV& a, const XMV& X, +void axpy(const execution_space& exec_space, const AV& a, const XMV& X, const YMV& Y) { - axpby(pExecSpace, a, X, + axpby(exec_space, a, X, Kokkos::ArithTraits::one(), Y); } diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 4396c81bb2..86b94c0bd1 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -198,8 +198,8 @@ SyrTester::SyrTester() // large enough to require 'relTol' to value 5.0e-3. The same // calculations show no discrepancies for calculations with double. // **************************************************************** - _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), From ce072b8df7a47789abc11785adc213a32b114cad Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 1 Aug 2023 09:27:42 -0600 Subject: [PATCH 018/326] Formatting --- blas/impl/KokkosBlas1_axpby_impl.hpp | 16 +- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 53 ++- blas/impl/KokkosBlas1_axpby_spec.hpp | 12 +- ...Blas1_axpby_unification_attempt_traits.hpp | 343 ++++++++++-------- blas/unit_test/Test_Blas2_syr.hpp | 8 +- 5 files changed, 245 insertions(+), 187 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index 14856b3bb7..29a72c19d5 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -90,8 +90,8 @@ struct Axpby_Functor { static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": XV and YV must have rank 1."); - static_assert((-1 <= scalar_x) && (scalar_x <= 2) && - (-1 <= scalar_y) && (scalar_y <= 2), + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2), "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { @@ -222,8 +222,8 @@ struct Axpby_Functor policy(space, 0, numRows); @@ -1277,13 +1277,13 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Generic()" ": XMV and YMV must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && - (-1 <= scalar_y) && (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( - "KokkosBlas::Impl::Axpby_MV_Generic()" - ": scalar_x and/or scalar_y are out of range."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": scalar_x and/or scalar_y are out of range."); } const SizeType numRows = x.extent(0); @@ -1405,13 +1405,13 @@ struct Axpby_MV_Invoke_Left { static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": X and Y must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && - (-1 <= scalar_y) && (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( - "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" - ": scalar_x and/or scalar_y are out of range."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": scalar_x and/or scalar_y are out of range."); } const SizeType numCols = x.extent(1); @@ -1497,16 +1497,15 @@ struct Axpby_MV_Invoke_Right { static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": X and Y must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && - (-1 <= scalar_y) && (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( - "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" - ": scalar_x and/or scalar_y are out of range."); - + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": scalar_x and/or scalar_y are out of range."); } - + const SizeType numCols = x.extent(1); if (numCols == 1) { auto x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index d36c65f135..3aff21e0be 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -226,8 +226,7 @@ struct Axpby; if (av == ATA::zero()) { scalar_x = 0; @@ -245,8 +244,7 @@ struct Axpby; if (bv == ATB::zero()) { scalar_y = 0; @@ -404,8 +402,7 @@ struct Axpby; if (av == ATA::zero()) { scalar_x = 0; @@ -423,8 +420,7 @@ struct Axpby; if (bv == ATB::zero()) { scalar_y = 0; diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 91cd591458..f7eccf160f 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -104,23 +104,23 @@ struct AxpbyUnificationAttemptTraits { // - variable names begin with lower case letters // - type names begin with upper case letters // ******************************************************************** -private: + private: static constexpr bool onDevice = KokkosKernels::Impl::kk_is_gpu_exec_space(); static constexpr bool onHost = !onDevice; static constexpr bool a_is_scalar = !Kokkos::is_view_v; - static constexpr bool a_is_r0 = Tr0_val(); - static constexpr bool a_is_r1s = Tr1s_val(); - static constexpr bool a_is_r1d = Tr1d_val(); + static constexpr bool a_is_r0 = Tr0_val(); + static constexpr bool a_is_r1s = Tr1s_val(); + static constexpr bool a_is_r1d = Tr1d_val(); static constexpr bool x_is_r1 = Kokkos::is_view_v && (XMV::rank == 1); static constexpr bool x_is_r2 = Kokkos::is_view_v && (XMV::rank == 2); static constexpr bool b_is_scalar = !Kokkos::is_view_v; - static constexpr bool b_is_r0 = Tr0_val(); - static constexpr bool b_is_r1s = Tr1s_val(); - static constexpr bool b_is_r1d = Tr1d_val(); + static constexpr bool b_is_r0 = Tr0_val(); + static constexpr bool b_is_r1s = Tr1s_val(); + static constexpr bool b_is_r1d = Tr1d_val(); static constexpr bool y_is_r1 = Kokkos::is_view_v && (YMV::rank == 1); static constexpr bool y_is_r2 = Kokkos::is_view_v && (YMV::rank == 2); @@ -132,12 +132,14 @@ struct AxpbyUnificationAttemptTraits { // Declare 'AtInputScalarTypeA_nonConst' // ******************************************************************** using ScalarTypeA2_onDevice = - typename getScalarTypeFromView::type; + typename getScalarTypeFromView::type; using ScalarTypeA1_onDevice = std::conditional_t; using ScalarTypeA2_onHost = - typename getScalarTypeFromView::type; + typename getScalarTypeFromView::type; using ScalarTypeA1_onHost = std::conditional_t; @@ -159,12 +161,14 @@ struct AxpbyUnificationAttemptTraits { // Declare 'AtInputScalarTypeB_nonConst' // ******************************************************************** using ScalarTypeB2_onDevice = - typename getScalarTypeFromView::type; + typename getScalarTypeFromView::type; using ScalarTypeB1_onDevice = std::conditional_t; using ScalarTypeB2_onHost = - typename getScalarTypeFromView::type; + typename getScalarTypeFromView::type; using ScalarTypeB1_onHost = std::conditional_t; @@ -194,11 +198,14 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'InternalTypeA_tmp' // ******************************************************************** - using AtInputLayoutA = typename getLayoutFromView::type; -public: + using AtInputLayoutA = + typename getLayoutFromView::type; + + public: static constexpr bool atInputLayoutA_isStride = std::is_same_v; -private: + + private: using InternalLayoutA = std::conditional_t<(a_is_r1d || a_is_r1s) && atInputLayoutA_isStride, AtInputLayoutA, InternalLayoutX>; @@ -233,7 +240,7 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'InternalTypeX' // ******************************************************************** -public: + public: using InternalTypeX = std::conditional_t< x_is_r2, Kokkos::View::type; -public: + private: + using AtInputLayoutB = + typename getLayoutFromView::type; + + public: static constexpr bool atInputLayoutB_isStride = std::is_same_v; -private: + + private: using InternalLayoutB = std::conditional_t<(b_is_r1d || b_is_r1s) && atInputLayoutB_isStride, AtInputLayoutB, InternalLayoutY>; @@ -286,7 +296,7 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'InternalTypeY' // ******************************************************************** -public: + public: using InternalTypeY = std::conditional_t< y_is_r2, Kokkos::View, Kokkos::View, @@ -337,10 +348,11 @@ struct AxpbyUnificationAttemptTraits { // Declare 'InternalTypeB_managed' with the same scalar type in // 'InternalTypeB' // ******************************************************************** -private: + private: using InternalLayoutB_managed = InternalLayoutB; -public: - using InternalTypeB_managed = std::conditional_t< + + public: + using InternalTypeB_managed = std::conditional_t< Kokkos::is_view_v, Kokkos::View, @@ -349,14 +361,16 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Auxiliary Boolean results on internal types // ******************************************************************** -private: - static constexpr bool internalTypeA_is_scalar = !Kokkos::is_view_v; + private: + static constexpr bool internalTypeA_is_scalar = + !Kokkos::is_view_v; static constexpr bool internalTypeA_is_r1d = Tr1d_val(); - static constexpr bool internalTypeB_is_scalar = !Kokkos::is_view_v; + static constexpr bool internalTypeB_is_scalar = + !Kokkos::is_view_v; static constexpr bool internalTypeB_is_r1d = Tr1d_val(); -public: + public: static constexpr bool internalTypesAB_bothScalars = (internalTypeA_is_scalar && internalTypeB_is_scalar); static constexpr bool internalTypesAB_bothViews = @@ -370,34 +384,44 @@ struct AxpbyUnificationAttemptTraits { // ****************************************************************** // Check 1/6: General checks // ****************************************************************** - static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": tExecSpace must be a valid Kokkos execution space."); - - static_assert((xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": one must have either both X and Y as rank 1, or both X and Y as rank 2"); - - if constexpr (Kokkos::ArithTraits::is_complex == false) { - static_assert((Kokkos::ArithTraits::is_complex == false) && - (Kokkos::ArithTraits::is_complex == false) && - (Kokkos::ArithTraits::is_complex == false), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": if Y is not complex, then A, X and B cannot be complex"); + static_assert( + Kokkos::is_execution_space_v, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": tExecSpace must be a valid Kokkos execution space."); + + static_assert( + (xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": one must have either both X and Y as rank 1, or both X and Y as " + "rank 2"); + + if constexpr (Kokkos::ArithTraits< + AtInputScalarTypeY_nonConst>::is_complex == false) { + static_assert( + (Kokkos::ArithTraits::is_complex == + false) && + (Kokkos::ArithTraits::is_complex == + false) && + (Kokkos::ArithTraits::is_complex == + false), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": if Y is not complex, then A, X and B cannot be complex"); } // ****************************************************************** // Check 2/6: YMV is valid // ****************************************************************** - static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": Y is const. It must be nonconst, " - "because it is an output argument " - "(we must be able to write to its entries)."); + static_assert( + Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is not a Kokkos::View."); + static_assert( + std::is_same::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is const. It must be nonconst, " + "because it is an output argument " + "(we must be able to write to its entries)."); static_assert( Kokkos::SpaceAccessibility::accessible, @@ -407,9 +431,10 @@ struct AxpbyUnificationAttemptTraits { // ****************************************************************** // Check 3/6: XMV is valid // ****************************************************************** - static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": X is not a Kokkos::View."); + static_assert( + Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": X is not a Kokkos::View."); static_assert( Kokkos::SpaceAccessibility::accessible, @@ -421,7 +446,8 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" + ")" << ", invalid rank-1 X extent" << ": X.extent(0) = " << X.extent(0) << ", Y.extent(0) = " << Y.extent(0); @@ -432,7 +458,8 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" + ")" << ", invalid rank-2 X extents" << ": X.extent(0) = " << X.extent(0) << ", X.extent(1) = " << X.extent(1) @@ -445,12 +472,14 @@ struct AxpbyUnificationAttemptTraits { // ****************************************************************** // Check 4/6: AV is valid // ****************************************************************** - static_assert(( a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || - (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || - (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || - (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": 'a' must be either scalar or rank 0 or rank 1 static or rank 1 dynamic"); + static_assert( + (a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": 'a' must be either scalar or rank 0 or rank 1 static or rank 1 " + "dynamic"); if constexpr (a_is_r1d || a_is_r1s) { if constexpr (xyRank1Case) { @@ -458,7 +487,8 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" << ": view 'a' must have extent(0) == 1 for xyRank1Case" << ", a.extent(0) = " << a.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); @@ -469,7 +499,8 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" << ": view 'a' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" << ", a.extent(0) = " << a.extent(0) @@ -483,12 +514,14 @@ struct AxpbyUnificationAttemptTraits { // ****************************************************************** // Check 5/6: BV is valid // ****************************************************************** - static_assert(( b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || - (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || - (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || - (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": 'b' must be either scalar or rank 0 or rank 1 static or rank 1 dynamic"); + static_assert( + (b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": 'b' must be either scalar or rank 0 or rank 1 static or rank 1 " + "dynamic"); if constexpr (b_is_r1d || b_is_r1s) { if constexpr (xyRank1Case) { @@ -496,7 +529,8 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" << ": view 'b' must have extent(0) == 1 for xyRank1Case" << ", b.extent(0) = " << b.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); @@ -506,7 +540,8 @@ struct AxpbyUnificationAttemptTraits { // Ok } else { std::ostringstream msg; - msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" << ": view 'b' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" << ", b.extent(0) = " << b.extent(0) @@ -524,121 +559,137 @@ struct AxpbyUnificationAttemptTraits { if constexpr (xyRank1Case) { constexpr bool internalTypeA_isOk = (internalTypeA_is_scalar || internalTypeA_is_r1d); - static_assert(internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeA is wrong"); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeA is wrong"); constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; - static_assert(internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeX is wrong"); + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = (internalTypeB_is_scalar || internalTypeB_is_r1d); - static_assert(internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeB is wrong"); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeB is wrong"); constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - static_assert(internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeY is wrong"); + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeY is wrong"); } else { constexpr bool internalTypeA_isOk = (internalTypeA_is_scalar || internalTypeA_is_r1d); - static_assert(internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeA is wrong"); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeA is wrong"); constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; - static_assert(internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeX is wrong"); + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = (internalTypeB_is_scalar || internalTypeB_is_r1d); - static_assert(internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeB is wrong"); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeB is wrong"); constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - static_assert(internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeY is wrong"); + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeY is wrong"); } } else { if constexpr (xyRank1Case) { constexpr bool internalTypeA_isOk = internalTypeA_is_r1d; - static_assert(internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeA is wrong"); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeA is wrong"); constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; - static_assert(internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeX is wrong"); + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = internalTypeB_is_r1d; - static_assert(internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeB is wrong"); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeB is wrong"); constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - static_assert(internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeY is wrong"); + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeY is wrong"); } else { constexpr bool internalTypeA_isOk = internalTypeA_is_r1d; - static_assert(internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeA is wrong"); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeA is wrong"); constexpr bool internalTypeX_isOk = std::is_same_v< InternalTypeX, Kokkos::View>>; - static_assert(internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeX is wrong"); + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = internalTypeB_is_r1d; - static_assert(internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeB is wrong"); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeB is wrong"); constexpr bool internalTypeY_isOk = std::is_same_v< InternalTypeY, Kokkos::View>>; - static_assert(internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeY is wrong"); + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeY is wrong"); } } @@ -679,24 +730,28 @@ struct AxpbyUnificationAttemptTraits { ", onDevice, invalid combination of types"); } - if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && atInputLayoutA_isStride) { - static_assert(std::is_same_v< - typename getLayoutFromView< - InternalTypeA, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" - ", but no LayoutStride internally"); + if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && + atInputLayoutA_isStride) { + static_assert( + std::is_same_v< + typename getLayoutFromView< + InternalTypeA, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } - if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && atInputLayoutB_isStride) { - static_assert(std::is_same_v< - typename getLayoutFromView< - InternalTypeB, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" - ", but no LayoutStride internally"); + if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && + atInputLayoutB_isStride) { + static_assert( + std::is_same_v< + typename getLayoutFromView< + InternalTypeB, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } } // Constructor @@ -712,7 +767,8 @@ struct AxpbyUnificationAttemptTraits { << ", AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() << ", isConst = " << std::is_const_v << ", isComplex = " - << Kokkos::ArithTraits::is_complex << ", AtInputScalarTypeA_nonConst = " + << Kokkos::ArithTraits::is_complex + << ", AtInputScalarTypeA_nonConst = " << typeid(AtInputScalarTypeA_nonConst).name() << ", InternalTypeA = " << typeid(InternalTypeA).name() << "\n" << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() @@ -727,7 +783,8 @@ struct AxpbyUnificationAttemptTraits { << typeid(typename XMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" + << "isComplex = " + << Kokkos::ArithTraits::is_complex << "\n" << "AtInputScalarTypeX_nonConst = " << typeid(AtInputScalarTypeX_nonConst).name() << "\n" << "InternalTypeX = " << typeid(InternalTypeX).name() << "\n" @@ -740,7 +797,8 @@ struct AxpbyUnificationAttemptTraits { << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() << ", isConst = " << std::is_const_v << ", isComplex = " - << Kokkos::ArithTraits::is_complex << ", AtInputScalarTypeB_nonConst = " + << Kokkos::ArithTraits::is_complex + << ", AtInputScalarTypeB_nonConst = " << typeid(AtInputScalarTypeB_nonConst).name() << ", InternalTypeB = " << typeid(InternalTypeB).name() << "\n" << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() @@ -755,7 +813,8 @@ struct AxpbyUnificationAttemptTraits { << typeid(typename YMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" + << "isComplex = " + << Kokkos::ArithTraits::is_complex << "\n" << "AtInputScalarTypeY_nonConst = " << typeid(AtInputScalarTypeY_nonConst).name() << "\n" << "InternalTypeY = " << typeid(InternalTypeY).name() << "\n" diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 86b94c0bd1..6c2651c47e 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -198,8 +198,12 @@ SyrTester::SyrTester() // large enough to require 'relTol' to value 5.0e-3. The same // calculations show no discrepancies for calculations with double. // **************************************************************** - _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value + ? 1.0e-6 + : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value + ? 5.0e-3 + : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), From f4c5351d08355ad8ecd9d357cbc700e0de20bf9e Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 1 Aug 2023 10:24:11 -0600 Subject: [PATCH 019/326] Using 'ifdef HAVE_KOKKOSKERNELS_DEBUG', per Luc's suggestion --- blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp | 3 +++ blas/src/KokkosBlas1_axpby.hpp | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index f7eccf160f..a1b7b19c2c 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -758,6 +758,7 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Routine to print information on input variables and internal variables // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG static void printInformation(std::ostream& os, std::string const& headerMsg) { os << headerMsg << ": AV = " << typeid(AV).name() @@ -820,6 +821,8 @@ struct AxpbyUnificationAttemptTraits { << "InternalTypeY = " << typeid(InternalTypeY).name() << "\n" << std::endl; } +#endif + }; // struct AxpbyUnificationAttemptTraits // -------------------------------- diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 1a95f06191..f371ab0b51 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -69,7 +69,9 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // Perform compile time checks and run time checks. // ********************************************************************** AxpbyTraits::performChecks(a, X, b, Y); - // AxpbyTraits::printInformation(std::cout, "axpby(), unif information"); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + AxpbyTraits::printInformation(std::cout, "axpby(), unif information"); +#endif // ********************************************************************** // Call Impl::Axpby<...>::axpby(...) From f767097e7ab4c3e656f2fc9e51cbdf5a4d6a1945 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 17 Oct 2023 15:27:16 -0600 Subject: [PATCH 020/326] Addressing feedbacks from Luc --- ...Blas1_axpby_unification_attempt_traits.hpp | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index a1b7b19c2c..188d9a2b35 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -154,8 +154,7 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** using AtInputScalarTypeX = typename XMV::value_type; - using AtInputScalarTypeX_nonConst = - typename std::remove_const::type; + using AtInputScalarTypeX_nonConst = typename XMV::non_const_value_type; // ******************************************************************** // Declare 'AtInputScalarTypeB_nonConst' @@ -183,8 +182,7 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** using AtInputScalarTypeY = typename YMV::value_type; - using AtInputScalarTypeY_nonConst = - typename std::remove_const::type; + using AtInputScalarTypeY_nonConst = typename YMV::non_const_value_type; // ******************************************************************** // Declare 'InternalLayoutX' and 'InternalLayoutY' @@ -395,15 +393,11 @@ struct AxpbyUnificationAttemptTraits { ": one must have either both X and Y as rank 1, or both X and Y as " "rank 2"); - if constexpr (Kokkos::ArithTraits< - AtInputScalarTypeY_nonConst>::is_complex == false) { + if constexpr (!Kokkos::ArithTraits::is_complex) { static_assert( - (Kokkos::ArithTraits::is_complex == - false) && - (Kokkos::ArithTraits::is_complex == - false) && - (Kokkos::ArithTraits::is_complex == - false), + (!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": if Y is not complex, then A, X and B cannot be complex"); } @@ -442,9 +436,7 @@ struct AxpbyUnificationAttemptTraits { ": XMV must be accessible from tExecSpace"); if constexpr (xyRank1Case) { - if (X.extent(0) == Y.extent(0)) { - // Ok - } else { + if (X.extent(0) != Y.extent(0)) { std::ostringstream msg; msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" ")" @@ -454,9 +446,7 @@ struct AxpbyUnificationAttemptTraits { KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { - if ((X.extent(0) == Y.extent(0)) && (X.extent(1) == Y.extent(1))) { - // Ok - } else { + if ((X.extent(0) != Y.extent(0)) || (X.extent(1) != Y.extent(1))) { std::ostringstream msg; msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" ")" @@ -483,9 +473,7 @@ struct AxpbyUnificationAttemptTraits { if constexpr (a_is_r1d || a_is_r1s) { if constexpr (xyRank1Case) { - if (a.extent(0) == 1) { - // Ok - } else { + if (a.extent(0) != 1) { std::ostringstream msg; msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" "performChecks()" @@ -525,9 +513,7 @@ struct AxpbyUnificationAttemptTraits { if constexpr (b_is_r1d || b_is_r1s) { if constexpr (xyRank1Case) { - if (b.extent(0) == 1) { - // Ok - } else { + if (b.extent(0) != 1) { std::ostringstream msg; msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" "performChecks()" From 61ac8207021b6878c9f80e6881b70a84854ae930 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 17 Oct 2023 16:08:37 -0600 Subject: [PATCH 021/326] Correcting compilation errors in my Mac --- .../Test_Blas1_axpby_unification.hpp | 20 +++++++++---------- blas/unit_test/Test_Blas2_ger.hpp | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index e468b513b0..6b2e5a3f5c 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -2126,12 +2126,12 @@ int test_axpby_mv_unification() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_float"); - test_axpby_unification(); + test_axpby_unification(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_unification_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_float"); - test_axpby_mv_unification(); + test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif @@ -2141,12 +2141,12 @@ TEST_F(TestCategory, axpby_mv_unification_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double"); - test_axpby_unification(); + test_axpby_unification(); } TEST_F(TestCategory, axpby_mv_unification_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_double"); - test_axpby_mv_unification(); + test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif @@ -2159,7 +2159,7 @@ TEST_F(TestCategory, axpby_unification_complex_double) { "KokkosBlas::Test::axpby_unification_complex_double"); test_axpby_unification, Kokkos::complex, Kokkos::complex, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_unification_complex_double) { @@ -2167,7 +2167,7 @@ TEST_F(TestCategory, axpby_mv_unification_complex_double) { "KokkosBlas::Test::axpby_mv_unification_complex_double"); test_axpby_mv_unification, Kokkos::complex, Kokkos::complex, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -2177,12 +2177,12 @@ TEST_F(TestCategory, axpby_mv_unification_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_int"); - test_axpby_unification(); + test_axpby_unification(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_unification_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_int"); - test_axpby_mv_unification(); + test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif @@ -2192,13 +2192,13 @@ TEST_F(TestCategory, axpby_mv_unification_int) { TEST_F(TestCategory, axpby_unification_double_int) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_unification_double_int"); - test_axpby_unification(); + test_axpby_unification(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_double_mv_unification_int) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_double_int"); - test_axpby_mv_unification(); + test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index a0860bae04..b32b0e1eaf 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -195,8 +195,8 @@ GerTester::value ? 1.0e-6 : 1.0e-9), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), From 12d1fd49e2158c85fa801f4fc1492a2cb27b387c Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 17 Oct 2023 16:20:18 -0600 Subject: [PATCH 022/326] Backup --- .../impl/KokkosBlas1_axpby_unification_attempt_traits.hpp | 7 ++++--- blas/unit_test/Test_Blas2_ger.hpp | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 188d9a2b35..49172a4c10 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -393,11 +393,12 @@ struct AxpbyUnificationAttemptTraits { ": one must have either both X and Y as rank 1, or both X and Y as " "rank 2"); - if constexpr (!Kokkos::ArithTraits::is_complex) { + if constexpr (!Kokkos::ArithTraits< + AtInputScalarTypeY_nonConst>::is_complex) { static_assert( (!Kokkos::ArithTraits::is_complex) && - (!Kokkos::ArithTraits::is_complex) && - (!Kokkos::ArithTraits::is_complex), + (!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": if Y is not complex, then A, X and B cannot be complex"); } diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index b32b0e1eaf..7d30a4b65d 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -195,8 +195,12 @@ GerTester::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value + ? 1.0e-6 + : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value + ? 5.0e-3 + : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), From 3fc73c942cfa33777b0cb940d6b0ffeff3e6b930 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 23 Oct 2023 09:23:47 -0600 Subject: [PATCH 023/326] SYR2: fix unit-test type issue On KokkosEco_Trilinos_Weaver_CUDA112_opt-uvm the SYR2 test enerates a compile time error probably due to a mixed use of host and device views when comparing implemented vs. reference results. --- blas/unit_test/Test_Blas2_syr2.hpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 080c106b9f..3d1a5a6d24 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -70,18 +70,17 @@ class Syr2Tester { const bool useUpOption = false); private: - typedef Kokkos::View _ViewTypeX; - typedef Kokkos::View _ViewTypeY; - typedef Kokkos::View _ViewTypeA; + using _ViewTypeX = Kokkos::View; + using _ViewTypeY = Kokkos::View; + using _ViewTypeA = Kokkos::View; - typedef typename _ViewTypeX::HostMirror _HostViewTypeX; - typedef typename _ViewTypeY::HostMirror _HostViewTypeY; - typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View - _ViewTypeExpected; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; - typedef Kokkos::ArithTraits _KAT_A; - typedef typename _KAT_A::mag_type _AuxType; + using _KAT_A = Kokkos::ArithTraits; + using _AuxType = typename _KAT_A::mag_type; void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, @@ -1661,6 +1660,7 @@ void Syr2Tester h_ger_reference( "h_ger_reference", _M, _N); Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); + Kokkos::deep_copy(h_ger_reference.h_base, h_ger_reference.d_base); std::string uplo = _useUpOption ? "U" : "L"; for (int i = 0; i < _M; ++i) { @@ -1669,22 +1669,22 @@ void Syr2Tester= j))) { // Keep h_ger_reference as already computed } else { - h_ger_reference.d_view(i, j) = org_A.h_view(i, j); + h_ger_reference.h_view(i, j) = org_A.h_view(i, j); } } } if (_useHermitianOption && _A_is_complex) { for (int i(0); i < _N; ++i) { - h_ger_reference.d_view(i, i) = - 0.5 * (h_ger_reference.d_view(i, i) + - _KAT_A::conj(h_ger_reference.d_view(i, i))); + h_ger_reference.h_view(i, i) = + 0.5 * (h_ger_reference.h_view(i, i) + + _KAT_A::conj(h_ger_reference.h_view(i, i))); } } // ******************************************************************** // Compare // ******************************************************************** - this->compareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.d_view); + this->compareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.h_view); } } // namespace Test From 6790651e376db6ea2a98860f775a0c6c3befd67d Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 23 Oct 2023 13:07:53 -0600 Subject: [PATCH 024/326] CUDA 11.0.1 / cuSPARSE 11.0.0 changed SpMM enums --- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 30e0b6e243..f28e04e26b 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -157,8 +157,9 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, cusparseOperation_t opB = xIsLL ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; -// CUSPARSE_MM_ALG_DEFAULT was deprecated as early as 11.1 (maybe earlier) -#if CUSPARSE_VERSION < 11010 +// CUSPARSE_MM_ALG_DEFAULT was deprecated in CUDA 11.0.1 / cuSPARSE 11.0.0 and +// removed in CUDA 12.0.0 / cuSPARSE 12.0.0 +#if CUSPARSE_VERSION < 11000 const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; #else const cusparseSpMMAlg_t alg = CUSPARSE_SPMM_ALG_DEFAULT; From c749f8c4d349e3917d5d36e0159545948df23f70 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 23 Oct 2023 13:43:15 -0600 Subject: [PATCH 025/326] SYR2: applying clang-format --- blas/unit_test/Test_Blas2_syr2.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 3d1a5a6d24..780e9ce162 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -74,10 +74,11 @@ class Syr2Tester { using _ViewTypeY = Kokkos::View; using _ViewTypeA = Kokkos::View; - using _HostViewTypeX = typename _ViewTypeX::HostMirror; - using _HostViewTypeY = typename _ViewTypeY::HostMirror; - using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = Kokkos::View; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = + Kokkos::View; using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; From f8aa101bfa73735bf0f74c8ffd994dc1d2967475 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 23 Oct 2023 12:00:14 -0600 Subject: [PATCH 026/326] CUDA 11.2.1 / cuSPARSE 11.4.0 changed SpMV --- perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp | 4 +++- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 02fcd1640a..85aab62122 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -581,7 +581,9 @@ int main(int argc, char** argv) { const double alpha = 1.0, beta = 1.0; size_t bufferSize = 0; void* dBuffer = NULL; -#if CUSPARSE_VERSION >= 11201 + +// CUSPARSE_MM_ALG_DEFAULT was deprecated in CUDA 11.2.1 a.k.a cuSPARSE 11.4.0 +#if CUSPARSE_VERSION >= 11400 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; #else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 23d85c0b5c..6ac5f49296 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -65,6 +65,8 @@ void spmv_cusparse(const Kokkos::Cuda& exec, !Kokkos::ArithTraits::isComplex) myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE; +// Hopefully this corresponds to CUDA reelase 10.1, which is the first to +// include the "generic" API #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) using entry_type = typename AMatrix::non_const_ordinal_type; @@ -105,7 +107,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, size_t bufferSize = 0; void* dBuffer = NULL; -#if CUSPARSE_VERSION >= 11301 +#if CUSPARSE_VERSION >= 11400 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; #else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; @@ -113,13 +115,13 @@ void spmv_cusparse(const Kokkos::Cuda& exec, if (controls.isParameter("algorithm")) { const std::string algName = controls.getParameter("algorithm"); if (algName == "default") -#if CUSPARSE_VERSION >= 11301 +#if CUSPARSE_VERSION >= 11400 alg = CUSPARSE_SPMV_ALG_DEFAULT; #else alg = CUSPARSE_MV_ALG_DEFAULT; #endif else if (algName == "merge") -#if CUSPARSE_VERSION >= 11301 +#if CUSPARSE_VERSION >= 11400 alg = CUSPARSE_SPMV_CSR_ALG2; #else alg = CUSPARSE_CSRMV_ALG2; From 3f988435fae726a499afbc982106a4107f8d0d9a Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 23 Oct 2023 17:48:08 -0600 Subject: [PATCH 027/326] KokkosBlas1_axpby: include for debug builds Resolve compilation errors in debug mode: "error: no member named 'cout' in namespace 'std';" --- blas/src/KokkosBlas1_axpby.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index f371ab0b51..0c94c4009c 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -17,6 +17,10 @@ #ifndef KOKKOSBLAS1_AXPBY_HPP_ #define KOKKOSBLAS1_AXPBY_HPP_ +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#include +#endif + #include #include #include From 5b5c101956543ca4803584e4d27a329ee109b9fb Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 6 Sep 2023 20:30:14 -0600 Subject: [PATCH 028/326] Backup --- .../gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in | 0 .../trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in | 0 .../KokkosBlas_gesv_eti_spec_avail.hpp.in | 0 .../KokkosBlas_trtri_eti_spec_avail.hpp.in | 0 {blas => lapack}/impl/KokkosBlas_gesv_impl.hpp | 0 {blas => lapack}/impl/KokkosBlas_gesv_spec.hpp | 0 {blas => lapack}/impl/KokkosBlas_trtri_impl.hpp | 0 {blas => lapack}/impl/KokkosBlas_trtri_spec.hpp | 0 {blas => lapack}/src/KokkosBlas_gesv.hpp | 0 {blas => lapack}/src/KokkosBlas_trtri.hpp | 0 {blas => lapack}/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp | 0 {blas => lapack}/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp | 0 {blas => lapack}/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp | 0 {blas => lapack}/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp | 0 {blas => lapack}/unit_test/Test_Blas_gesv.hpp | 0 {blas => lapack}/unit_test/Test_Blas_trtri.hpp | 0 16 files changed, 0 insertions(+), 0 deletions(-) rename {blas => lapack}/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in (100%) rename {blas => lapack}/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in (100%) rename {blas => lapack}/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in (100%) rename {blas => lapack}/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in (100%) rename {blas => lapack}/impl/KokkosBlas_gesv_impl.hpp (100%) rename {blas => lapack}/impl/KokkosBlas_gesv_spec.hpp (100%) rename {blas => lapack}/impl/KokkosBlas_trtri_impl.hpp (100%) rename {blas => lapack}/impl/KokkosBlas_trtri_spec.hpp (100%) rename {blas => lapack}/src/KokkosBlas_gesv.hpp (100%) rename {blas => lapack}/src/KokkosBlas_trtri.hpp (100%) rename {blas => lapack}/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp (100%) rename {blas => lapack}/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp (100%) rename {blas => lapack}/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp (100%) rename {blas => lapack}/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp (100%) rename {blas => lapack}/unit_test/Test_Blas_gesv.hpp (100%) rename {blas => lapack}/unit_test/Test_Blas_trtri.hpp (100%) diff --git a/blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in similarity index 100% rename from blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in rename to lapack/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in diff --git a/blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in similarity index 100% rename from blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in rename to lapack/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in similarity index 100% rename from blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in rename to lapack/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in similarity index 100% rename from blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in rename to lapack/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in diff --git a/blas/impl/KokkosBlas_gesv_impl.hpp b/lapack/impl/KokkosBlas_gesv_impl.hpp similarity index 100% rename from blas/impl/KokkosBlas_gesv_impl.hpp rename to lapack/impl/KokkosBlas_gesv_impl.hpp diff --git a/blas/impl/KokkosBlas_gesv_spec.hpp b/lapack/impl/KokkosBlas_gesv_spec.hpp similarity index 100% rename from blas/impl/KokkosBlas_gesv_spec.hpp rename to lapack/impl/KokkosBlas_gesv_spec.hpp diff --git a/blas/impl/KokkosBlas_trtri_impl.hpp b/lapack/impl/KokkosBlas_trtri_impl.hpp similarity index 100% rename from blas/impl/KokkosBlas_trtri_impl.hpp rename to lapack/impl/KokkosBlas_trtri_impl.hpp diff --git a/blas/impl/KokkosBlas_trtri_spec.hpp b/lapack/impl/KokkosBlas_trtri_spec.hpp similarity index 100% rename from blas/impl/KokkosBlas_trtri_spec.hpp rename to lapack/impl/KokkosBlas_trtri_spec.hpp diff --git a/blas/src/KokkosBlas_gesv.hpp b/lapack/src/KokkosBlas_gesv.hpp similarity index 100% rename from blas/src/KokkosBlas_gesv.hpp rename to lapack/src/KokkosBlas_gesv.hpp diff --git a/blas/src/KokkosBlas_trtri.hpp b/lapack/src/KokkosBlas_trtri.hpp similarity index 100% rename from blas/src/KokkosBlas_trtri.hpp rename to lapack/src/KokkosBlas_trtri.hpp diff --git a/blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp similarity index 100% rename from blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp rename to lapack/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp diff --git a/blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp similarity index 100% rename from blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp rename to lapack/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp diff --git a/blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp similarity index 100% rename from blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp rename to lapack/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp diff --git a/blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp similarity index 100% rename from blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp rename to lapack/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp diff --git a/blas/unit_test/Test_Blas_gesv.hpp b/lapack/unit_test/Test_Blas_gesv.hpp similarity index 100% rename from blas/unit_test/Test_Blas_gesv.hpp rename to lapack/unit_test/Test_Blas_gesv.hpp diff --git a/blas/unit_test/Test_Blas_trtri.hpp b/lapack/unit_test/Test_Blas_trtri.hpp similarity index 100% rename from blas/unit_test/Test_Blas_trtri.hpp rename to lapack/unit_test/Test_Blas_trtri.hpp From 8f4914072cdeee50b39e92a6f1c8b596ffe66f53 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 6 Sep 2023 20:48:51 -0600 Subject: [PATCH 029/326] Backup --- CMakeLists.txt | 10 +++++- blas/tpls/KokkosBlas_Host_tpl.hpp | 6 ---- cmake/KokkosKernels_config.h.in | 2 ++ lapack/tpls/KokkosLapack_Host_tpl.hpp | 44 +++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 7 deletions(-) create mode 100644 lapack/tpls/KokkosLapack_Host_tpl.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 893e4239cd..812640374b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,7 @@ IF (KokkosKernels_INSTALL_TESTING) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(batched/dense/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(batched/sparse/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(ode/unit_test) @@ -192,7 +193,7 @@ ELSE() "ALL" STRING "A list of components to enable in testing and building" - VALID_ENTRIES BATCHED BLAS GRAPH SPARSE ALL + VALID_ENTRIES BATCHED BLAS LAPACK GRAPH SPARSE ALL ) # ================================================================== @@ -243,6 +244,7 @@ ELSE() MESSAGE(" COMMON: ON") MESSAGE(" BATCHED: ${KokkosKernels_ENABLE_COMPONENT_BATCHED}") MESSAGE(" BLAS: ${KokkosKernels_ENABLE_COMPONENT_BLAS}") + MESSAGE(" LAPACK: ${KokkosKernels_ENABLE_COMPONENT_LAPACK}") MESSAGE(" GRAPH: ${KokkosKernels_ENABLE_COMPONENT_GRAPH}") MESSAGE(" SPARSE: ${KokkosKernels_ENABLE_COMPONENT_SPARSE}") MESSAGE(" ODE: ${KokkosKernels_ENABLE_COMPONENT_ODE}") @@ -287,6 +289,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_BLAS) INCLUDE(blas/CMakeLists.txt) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) + INCLUDE(lapack/CMakeLists.txt) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) INCLUDE(graph/CMakeLists.txt) ENDIF() @@ -405,6 +410,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_BLAS) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) ENDIF() diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 3b0c7f366e..29afff4d62 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -115,12 +115,6 @@ struct HostBlas { const char diag, int m, int n, const T alpha, const T *a, int lda, /* */ T *b, int ldb); - - static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, - int info); - - static int trtri(const char uplo, const char diag, int n, const T *a, - int lda); }; } // namespace Impl } // namespace KokkosBlas diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index b8b66fffbb..621c78bfcc 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -109,6 +109,8 @@ /* BLAS library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_BLAS +/* LAPACK library */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL /* CUSPARSE */ diff --git a/lapack/tpls/KokkosLapack_Host_tpl.hpp b/lapack/tpls/KokkosLapack_Host_tpl.hpp new file mode 100644 index 0000000000..d74099aaec --- /dev/null +++ b/lapack/tpls/KokkosLapack_Host_tpl.hpp @@ -0,0 +1,44 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_HOST_TPL_HPP_ +#define KOKKOSLAPACK_HOST_TPL_HPP_ + +/// \file KokkosLapack_Host_tpl.hpp +/// \brief LAPACK wrapper + +#include "KokkosKernels_config.h" +#include "Kokkos_ArithTraits.hpp" + +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) + +namespace KokkosLapack { +namespace Impl { + +template +struct HostLapack { + static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, + int info); + + static int trtri(const char uplo, const char diag, int n, const T *a, + int lda); +}; +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK + +#endif // KOKKOSLAPACK_HOST_TPL_HPP_ From 845f7f2505503427325759496bf4f32f785eb6c9 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 6 Sep 2023 21:19:57 -0600 Subject: [PATCH 030/326] Backup --- ...=> KokkosLapack_gesv_eti_spec_inst.cpp.in} | 6 +- ...> KokkosLapack_trtri_eti_spec_inst.cpp.in} | 6 +- ...> KokkosLapack_gesv_eti_spec_avail.hpp.in} | 8 +- ... KokkosLapack_trtri_eti_spec_avail.hpp.in} | 12 +- ...sv_impl.hpp => KokkosLapack_gesv_impl.hpp} | 14 +- ...sv_spec.hpp => KokkosLapack_gesv_spec.hpp} | 38 +++--- ...i_impl.hpp => KokkosLapack_trtri_impl.hpp} | 12 +- ...i_spec.hpp => KokkosLapack_trtri_spec.hpp} | 32 ++--- ...kosBlas_gesv.hpp => KokkosLapack_gesv.hpp} | 52 ++++---- ...sBlas_trtri.hpp => KokkosLapack_trtri.hpp} | 22 ++-- ...p => KokkosLapack_gesv_tpl_spec_avail.hpp} | 48 +++---- ...pp => KokkosLapack_gesv_tpl_spec_decl.hpp} | 120 +++++++++--------- ... => KokkosLapack_trtri_tpl_spec_avail.hpp} | 76 +++++------ ...p => KokkosLapack_trtri_tpl_spec_decl.hpp} | 106 ++++++++-------- ...est_Blas_gesv.hpp => Test_Lapack_gesv.hpp} | 78 ++++++------ ...t_Blas_trtri.hpp => Test_Lapack_trtri.hpp} | 56 ++++---- 16 files changed, 344 insertions(+), 342 deletions(-) rename lapack/eti/generated_specializations_cpp/gesv/{KokkosBlas_gesv_eti_spec_inst.cpp.in => KokkosLapack_gesv_eti_spec_inst.cpp.in} (88%) rename lapack/eti/generated_specializations_cpp/trtri/{KokkosBlas_trtri_eti_spec_inst.cpp.in => KokkosLapack_trtri_eti_spec_inst.cpp.in} (88%) rename lapack/eti/generated_specializations_hpp/{KokkosBlas_gesv_eti_spec_avail.hpp.in => KokkosLapack_gesv_eti_spec_avail.hpp.in} (80%) rename lapack/eti/generated_specializations_hpp/{KokkosBlas_trtri_eti_spec_avail.hpp.in => KokkosLapack_trtri_eti_spec_avail.hpp.in} (73%) rename lapack/impl/{KokkosBlas_gesv_impl.hpp => KokkosLapack_gesv_impl.hpp} (73%) rename lapack/impl/{KokkosBlas_gesv_spec.hpp => KokkosLapack_gesv_spec.hpp} (83%) rename lapack/impl/{KokkosBlas_trtri_impl.hpp => KokkosLapack_trtri_impl.hpp} (91%) rename lapack/impl/{KokkosBlas_trtri_spec.hpp => KokkosLapack_trtri_spec.hpp} (83%) rename lapack/src/{KokkosBlas_gesv.hpp => KokkosLapack_gesv.hpp} (76%) rename lapack/src/{KokkosBlas_trtri.hpp => KokkosLapack_trtri.hpp} (88%) rename lapack/tpls/{KokkosBlas_gesv_tpl_spec_avail.hpp => KokkosLapack_gesv_tpl_spec_avail.hpp} (70%) rename lapack/tpls/{KokkosBlas_gesv_tpl_spec_decl.hpp => KokkosLapack_gesv_tpl_spec_decl.hpp} (89%) rename lapack/tpls/{KokkosBlas_trtri_tpl_spec_avail.hpp => KokkosLapack_trtri_tpl_spec_avail.hpp} (56%) rename lapack/tpls/{KokkosBlas_trtri_tpl_spec_decl.hpp => KokkosLapack_trtri_tpl_spec_decl.hpp} (73%) rename lapack/unit_test/{Test_Blas_gesv.hpp => Test_Lapack_gesv.hpp} (83%) rename lapack/unit_test/{Test_Blas_trtri.hpp => Test_Lapack_trtri.hpp} (88%) diff --git a/lapack/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in similarity index 88% rename from lapack/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in rename to lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in index 32473be3ad..da521984a4 100644 --- a/lapack/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in +++ b/lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in @@ -17,10 +17,10 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true #include "KokkosKernels_config.h" -#include "KokkosBlas_gesv_spec.hpp" +#include "KokkosLapack_gesv_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -@BLAS_GESV_ETI_INST_BLOCK@ +@LAPACK_GESV_ETI_INST_BLOCK@ } //IMPL } //Kokkos diff --git a/lapack/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in similarity index 88% rename from lapack/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in rename to lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in index 64755f7a54..c4ab12f5a4 100644 --- a/lapack/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in +++ b/lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in @@ -17,10 +17,10 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true #include "KokkosKernels_config.h" -#include "KokkosBlas_trtri_spec.hpp" +#include "KokkosLapack_trtri_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -@BLAS_TRTRI_ETI_INST_BLOCK@ +@LAPACK_TRTRI_ETI_INST_BLOCK@ } //IMPL } //Kokkos diff --git a/lapack/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in similarity index 80% rename from lapack/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in rename to lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in index ae262c912e..d1f36e3069 100644 --- a/lapack/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_GESV_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { namespace Impl { -@BLAS_GESV_ETI_AVAIL_BLOCK@ +@LAPACK_GESV_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos #endif diff --git a/lapack/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in similarity index 73% rename from lapack/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in rename to lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in index 3f669efa06..89443c2c9b 100644 --- a/lapack/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in @@ -14,13 +14,13 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { namespace Impl { -@BLAS_TRTRI_ETI_AVAIL_BLOCK@ +@LAPACK_TRTRI_ETI_AVAIL_BLOCK@ } // Impl -} // KokkosBlas -#endif // KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ +} // KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ diff --git a/lapack/impl/KokkosBlas_gesv_impl.hpp b/lapack/impl/KokkosLapack_gesv_impl.hpp similarity index 73% rename from lapack/impl/KokkosBlas_gesv_impl.hpp rename to lapack/impl/KokkosLapack_gesv_impl.hpp index e51e48309f..3a60f42171 100644 --- a/lapack/impl/KokkosBlas_gesv_impl.hpp +++ b/lapack/impl/KokkosLapack_gesv_impl.hpp @@ -14,21 +14,21 @@ // //@HEADER -#ifndef KOKKOSBLAS_IMPL_GESV_HPP_ -#define KOKKOSBLAS_IMPL_GESV_HPP_ +#ifndef KOKKOSLAPACK_IMPL_GESV_HPP_ +#define KOKKOSLAPACK_IMPL_GESV_HPP_ -/// \file KokkosBlas_gesv_impl.hpp +/// \file KokkosLapack_gesv_impl.hpp /// \brief Implementation(s) of dense linear solve. #include #include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -// NOTE: Might add the implementation of KokkosBlas::gesv later +// NOTE: Might add the implementation of KokkosLapack::gesv later } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack -#endif // KOKKOSBLAS_IMPL_GESV_HPP +#endif // KOKKOSLAPACK_IMPL_GESV_HPP diff --git a/lapack/impl/KokkosBlas_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp similarity index 83% rename from lapack/impl/KokkosBlas_gesv_spec.hpp rename to lapack/impl/KokkosLapack_gesv_spec.hpp index f1dff467c8..8ea1df03bf 100644 --- a/lapack/impl/KokkosBlas_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ -#define KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ +#ifndef KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ +#define KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ #include #include @@ -22,10 +22,10 @@ // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -#include +#include #endif -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -33,16 +33,16 @@ struct gesv_eti_spec_avail { enum : bool { value = false }; }; } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // Macro for declaration of full specialization availability -// KokkosBlas::Impl::GESV. This is NOT for users!!! All +// KokkosLapack::Impl::GESV. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template <> \ struct gesv_eti_spec_avail< \ @@ -56,14 +56,14 @@ struct gesv_eti_spec_avail { }; // Include the actual specialization declarations -#include -#include +#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Unification layer -/// \brief Implementation of KokkosBlas::gesv. +/// \brief Implementation of KokkosLapack::gesv. template ::value, @@ -79,25 +79,25 @@ template struct GESV { static void gesv(const AMatrix & /* A */, const BXMV & /* B */, const IPIVV & /* IPIV */) { - // NOTE: Might add the implementation of KokkosBlas::gesv later + // NOTE: Might add the implementation of KokkosLapack::gesv later throw std::runtime_error( "No fallback implementation of GESV (general LU factorization & solve) " - "exists. Enable BLAS and/or MAGMA TPL."); + "exists. Enable LAPACK and/or MAGMA TPL."); } }; #endif } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // Macro for declaration of full specialization of -// KokkosBlas::Impl::GESV. This is NOT for users!!! All +// KokkosLapack::Impl::GESV. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ extern template struct GESV< \ Kokkos::View { Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GESV< \ Kokkos::View { Kokkos::MemoryTraits >, \ false, true>; -#include +#include -#endif // KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ +#endif // KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ diff --git a/lapack/impl/KokkosBlas_trtri_impl.hpp b/lapack/impl/KokkosLapack_trtri_impl.hpp similarity index 91% rename from lapack/impl/KokkosBlas_trtri_impl.hpp rename to lapack/impl/KokkosLapack_trtri_impl.hpp index 4501763ea8..9f52c2d412 100644 --- a/lapack/impl/KokkosBlas_trtri_impl.hpp +++ b/lapack/impl/KokkosLapack_trtri_impl.hpp @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_IMPL_HPP_ -#define KOKKOSBLAS_TRTRI_IMPL_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_IMPL_HPP_ +#define KOKKOSLAPACK_TRTRI_IMPL_HPP_ /** - * \file KokkosBlas_trtri_impl.hpp + * \file KokkosLapack_trtri_impl.hpp * \brief Implementation of triangular matrix inverse */ @@ -27,7 +27,7 @@ #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { template @@ -65,5 +65,5 @@ void SerialTrtri_Invoke(const RViewType &R, const char uplo[], } } } // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSBLAS_TRTRI_IMPL_HPP_ +} // namespace KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_IMPL_HPP_ diff --git a/lapack/impl/KokkosBlas_trtri_spec.hpp b/lapack/impl/KokkosLapack_trtri_spec.hpp similarity index 83% rename from lapack/impl/KokkosBlas_trtri_spec.hpp rename to lapack/impl/KokkosLapack_trtri_spec.hpp index 2a4d2db576..e48b37f7c2 100644 --- a/lapack/impl/KokkosBlas_trtri_spec.hpp +++ b/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -13,17 +13,17 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_SPEC_HPP_ -#define KOKKOSBLAS_TRTRI_SPEC_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_SPEC_HPP_ +#define KOKKOSLAPACK_TRTRI_SPEC_HPP_ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -#include +#include #endif -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -31,13 +31,13 @@ struct trtri_eti_spec_avail { enum : bool { value = false }; }; } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // This Macros provides the ETI specialization of trtri, currently not // available. // -#define KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ MEM_SPACE) \ template <> \ struct trtri_eti_spec_avail< \ @@ -49,10 +49,10 @@ struct trtri_eti_spec_avail { }; // Include the actual specialization declarations -#include -#include +#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // @@ -77,8 +77,8 @@ struct TRTRI { static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::trtri[ETI]" - : "KokkosBlas::trtri[noETI]"); + ? "KokkosLapack::trtri[ETI]" + : "KokkosLapack::trtri[noETI]"); typename AVIT::HostMirror host_A = Kokkos::create_mirror_view(A); typename RVIT::HostMirror host_R = Kokkos::create_mirror_view(R); @@ -97,7 +97,7 @@ struct TRTRI { //! KOKKOSKERNELS_IMPL_COMPILE_LIBRARY } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // These Macros are only included when we are not compiling libkokkoskernels but @@ -106,7 +106,7 @@ struct TRTRI { // "extern template" skips the implicit instatiation step ensuring that the // callers code uses this explicit instantiation definition of TRTRI. // -#define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ extern template struct TRTRI< \ Kokkos::View >, \ @@ -114,7 +114,7 @@ struct TRTRI { Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ template struct TRTRI< \ Kokkos::View >, \ @@ -122,6 +122,6 @@ struct TRTRI { Kokkos::MemoryTraits >, \ false, true>; -#include +#include -#endif // KOKKOSBLAS_TRTRI_SPEC_HPP_ +#endif // KOKKOSLAPACK_TRTRI_SPEC_HPP_ diff --git a/lapack/src/KokkosBlas_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp similarity index 76% rename from lapack/src/KokkosBlas_gesv.hpp rename to lapack/src/KokkosLapack_gesv.hpp index 89b9d36c96..b08f523f6e 100644 --- a/lapack/src/KokkosBlas_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -14,23 +14,23 @@ // //@HEADER -/// \file KokkosBlas_gesv.hpp +/// \file KokkosLapack_gesv.hpp /// \brief Local dense linear solve /// -/// This file provides KokkosBlas::gesv. This function performs a +/// This file provides KokkosLapack::gesv. This function performs a /// local (no MPI) dense linear solve on a system of linear equations /// A * X = B where A is a general N-by-N matrix and X and B are N-by-NRHS /// matrices. -#ifndef KOKKOSBLAS_GESV_HPP_ -#define KOKKOSBLAS_GESV_HPP_ +#ifndef KOKKOSLAPACK_GESV_HPP_ +#define KOKKOSLAPACK_GESV_HPP_ #include -#include "KokkosBlas_gesv_spec.hpp" +#include "KokkosLapack_gesv_spec.hpp" #include "KokkosKernels_Error.hpp" -namespace KokkosBlas { +namespace KokkosLapack { /// \brief Solve the dense linear equation system A*X = B. /// @@ -50,24 +50,24 @@ namespace KokkosBlas { /// template void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { - // NOTE: Currently, KokkosBlas::gesv only supports for MAGMA TPL and BLAS TPL. + // NOTE: Currently, KokkosLapack::gesv only supports for MAGMA TPL and LAPACK TPL. // MAGMA TPL should be enabled to call the MAGMA GPU interface for - // device views BLAS TPL should be enabled to call the BLAS interface + // device views LAPACK TPL should be enabled to call the LAPACK interface // for host views static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: A must be a Kokkos::View."); + "KokkosLapack::gesv: A must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: B must be a Kokkos::View."); + "KokkosLapack::gesv: B must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: IPIV must be a Kokkos::View."); + "KokkosLapack::gesv: IPIV must be a Kokkos::View."); static_assert(static_cast(AMatrix::rank) == 2, - "KokkosBlas::gesv: A must have rank 2."); + "KokkosLapack::gesv: A must have rank 2."); static_assert( static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, - "KokkosBlas::gesv: B must have either rank 1 or rank 2."); + "KokkosLapack::gesv: B must have either rank 1 or rank 2."); static_assert(static_cast(IPIVV::rank) == 1, - "KokkosBlas::gesv: IPIV must have rank 1."); + "KokkosLapack::gesv: IPIV must have rank 1."); int64_t IPIV0 = IPIV.extent(0); int64_t A0 = A.extent(0); @@ -79,7 +79,7 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); if (!(valid_pivot)) { std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". " + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " << "Valid options include zero-extent 1-D view (no pivoting), or 1-D " "View with size of " << A0 << " (partial pivoting)."; @@ -88,22 +88,22 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { // Check for no pivoting case. Only MAGMA supports no pivoting interface #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL if ((!std::is_same::value) && (IPIV0 == 0) && (IPIV.data() == nullptr)) { std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". " - << "BLAS TPL does not support no pivoting."; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " + << "LAPACK TPL does not support no pivoting."; KokkosKernels::Impl::throw_runtime_exception(os.str()); } #endif #else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL if ((IPIV0 == 0) && (IPIV.data() == nullptr)) { std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". " - << "BLAS TPL does not support no pivoting."; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " + << "LAPACK TPL does not support no pivoting."; KokkosKernels::Impl::throw_runtime_exception(os.str()); } #endif @@ -112,7 +112,7 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { // Check compatibility of dimensions at run time. if ((A0 < A1) || (A0 != B0)) { std::ostringstream os; - os << "KokkosBlas::gesv: Dimensions of A, and B do not match: " + os << "KokkosLapack::gesv: Dimensions of A, and B do not match: " << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) << " x " << B.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); @@ -136,15 +136,15 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { if (BXMV::rank == 1) { auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); - KokkosBlas::Impl::GESV::gesv(A_i, B_i, IPIV_i); } else { // BXMV::rank == 2 auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); - KokkosBlas::Impl::GESV::gesv(A_i, B_i, IPIV_i); } } -} // namespace KokkosBlas +} // namespace KokkosLapack -#endif // KOKKOSBLAS_GESV_HPP_ +#endif // KOKKOSLAPACK_GESV_HPP_ diff --git a/lapack/src/KokkosBlas_trtri.hpp b/lapack/src/KokkosLapack_trtri.hpp similarity index 88% rename from lapack/src/KokkosBlas_trtri.hpp rename to lapack/src/KokkosLapack_trtri.hpp index b1a34f0483..44e8fc9f65 100644 --- a/lapack/src/KokkosBlas_trtri.hpp +++ b/lapack/src/KokkosLapack_trtri.hpp @@ -13,19 +13,19 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_HPP_ -#define KOKKOSBLAS_TRTRI_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_HPP_ +#define KOKKOSLAPACK_TRTRI_HPP_ -/// \file KokkosBlas_trtri.hpp +/// \file KokkosLapack_trtri.hpp #include "KokkosKernels_Macros.hpp" -#include "KokkosBlas_trtri_spec.hpp" +#include "KokkosLapack_trtri_spec.hpp" #include "KokkosKernels_helpers.hpp" #include #include #include "KokkosKernels_Error.hpp" -namespace KokkosBlas { +namespace KokkosLapack { /// \brief Find the inverse of the triangular matrix, A /// @@ -62,14 +62,14 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { if (!valid_uplo) { std::ostringstream os; - os << "KokkosBlas::trtri: uplo = '" << uplo[0] << "'. " + os << "KokkosLapack::trtri: uplo = '" << uplo[0] << "'. " << "Valid values include 'U' or 'u' (A is upper triangular), " "'L' or 'l' (A is lower triangular)."; KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (!valid_diag) { std::ostringstream os; - os << "KokkosBlas::trtri: diag = '" << diag[0] << "'. " + os << "KokkosLapack::trtri: diag = '" << diag[0] << "'. " << "Valid values include 'U' or 'u' (the diagonal of A is assumed to be " "unit), " "'N' or 'n' (the diagonal of A is assumed to be non-unit)."; @@ -88,7 +88,7 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { // or B*A if (A_m != A_n) { std::ostringstream os; - os << "KokkosBlas::trtri: Dimensions of A do not match," + os << "KokkosLapack::trtri: Dimensions of A do not match," << " A: " << A.extent(0) << " x " << A.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -108,12 +108,12 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { int result; RViewInternalType R = RViewInternalType(&result); - KokkosBlas::Impl::TRTRI::trtri(R, uplo, + KokkosLapack::Impl::TRTRI::trtri(R, uplo, diag, A); return result; } -} // namespace KokkosBlas +} // namespace KokkosLapack -#endif // KOKKOS_BLASLAPACK_TRTRI_HPP_ +#endif // KOKKOSLAPACK_TRTRI_HPP_ diff --git a/lapack/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp similarity index 70% rename from lapack/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp rename to lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index f909b4a295..74a65d4cf9 100644 --- a/lapack/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -14,10 +14,10 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_HPP_ +#ifndef KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -25,10 +25,10 @@ struct gesv_tpl_spec_avail { enum : bool { value = false }; }; -// Generic Host side BLAS (could be MKL or whatever) -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +// Generic Host side LAPACK (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -38,30 +38,30 @@ struct gesv_tpl_spec_avail { enum : bool { value = true }; \ }; -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( double, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_FLOAT) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( float, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #endif */ #endif @@ -69,7 +69,7 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -79,36 +79,36 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif enum : bool { value = true }; \ }; -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_FLOAT) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex,Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif */ #endif } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack #endif diff --git a/lapack/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp similarity index 89% rename from lapack/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp rename to lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 7d8f0a8a2b..dcab48f07b 100644 --- a/lapack/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -14,21 +14,21 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_TPL_SPEC_DECL_HPP_ -#define KOKKOSBLAS_GESV_TPL_SPEC_DECL_HPP_ +#ifndef KOKKOSLAPACK_GESV_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_GESV_TPL_SPEC_DECL_HPP_ -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { template inline void gesv_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - printf("KokkosBlas::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", + printf("KokkosLapack::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", typeid(AViewType).name(), typeid(BViewType).name(), typeid(PViewType).name()); #else -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - printf("KokkosBlas::gesv<> TPL Blas specialization for < %s , %s, %s >\n", +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK + printf("KokkosLapack::gesv<> TPL Lapack specialization for < %s , %s, %s >\n", typeid(AViewType).name(), typeid(BViewType).name(), typeid(PViewType).name()); #endif @@ -36,16 +36,16 @@ inline void gesv_print_specialization() { #endif } } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack -// Generic Host side BLAS (could be MKL or whatever) -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#include +// Generic Host side LAPACK (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#define KOKKOSBLAS_DGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -74,7 +74,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_BLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -89,14 +89,14 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ LDB, info); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS_SGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_SGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -125,7 +125,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_BLAS,float]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,float]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -140,14 +140,14 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ LDB, info); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS_ZGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -178,7 +178,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_BLAS,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -193,7 +193,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -202,7 +202,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_CGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -233,7 +233,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_BLAS,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -248,7 +248,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -257,30 +257,30 @@ namespace Impl { } \ }; -KOKKOSBLAS_DGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_DGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_SGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_SGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_ZGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_ZGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_CGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_CGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#define KOKKOSBLAS_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -309,7 +309,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_MAGMA,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -321,8 +321,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -339,7 +339,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -368,7 +368,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_MAGMA,float]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -380,8 +380,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -398,7 +398,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -429,7 +429,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -441,8 +441,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -459,7 +459,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -490,7 +490,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -502,8 +502,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -520,20 +520,20 @@ namespace Impl { } \ }; -KOKKOSBLAS_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA #endif diff --git a/lapack/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp similarity index 56% rename from lapack/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp rename to lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp index de9fc08c99..d723cef260 100644 --- a/lapack/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp @@ -14,10 +14,10 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -27,7 +27,7 @@ struct trtri_tpl_spec_avail { }; // Generic Host side LAPACK (could be MKL or whatever) -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ template \ struct trtri_tpl_spec_avail< \ Kokkos::View, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack -#endif // KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ +#endif // KOKKOSLAPACKy_TRTRI_TPL_SPEC_AVAIL_HPP_ diff --git a/lapack/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp similarity index 73% rename from lapack/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp rename to lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 46ec894547..9f79ad2eb5 100644 --- a/lapack/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -14,17 +14,17 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ -#define KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ -#include "KokkosBlas_Host_tpl.hpp" // trtri prototype -#include "KokkosBlas_tpl_spec.hpp" +#include "KokkosLapack_Host_tpl.hpp" // trtri prototype +#include "KokkosLapack_tpl_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRTRI(A.extent(0)); \ \ @@ -61,19 +61,19 @@ namespace Impl { else \ uplo_ = A_is_layout_left ? 'U' : 'L'; \ \ - R() = HostBlas::trtri( \ + R() = HostLapack::trtri( \ uplo_, diag[0], M, \ reinterpret_cast(A.data()), LDA); \ Kokkos::Profiling::popRegion(); \ } \ }; #else -#define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ MEM_SPACE, ETI_SPEC_AVAIL) -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_TRTRI_BLAS_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRTRI(A.extent(0)); \ \ @@ -116,8 +116,8 @@ namespace Impl { else \ diag_ = MagmaNonUnit; \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ R() = MAGMA_FN(uplo_, diag_, M, \ reinterpret_cast( \ const_cast(A.data())), \ @@ -126,71 +126,71 @@ namespace Impl { } \ }; #else -#define KOKKOSBLAS_TRTRI_BLAS_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA // Explicitly define the TRTRI class for all permutations listed below // Handle type and space permutations -#define KOKKOSBLAS_DTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) -#define KOKKOSBLAS_STRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) -#define KOKKOSBLAS_ZTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(Kokkos::complex, std::complex, \ +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ magma_ztrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ magma_ztrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ ETI_SPEC_AVAIL) -#define KOKKOSBLAS_CTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(Kokkos::complex, std::complex, \ +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ ETI_SPEC_AVAIL) // Handle layout permutations -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutRight, false) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutRight, false) } // namespace Impl -} // nameSpace KokkosBlas +} // nameSpace KokkosLapack -#endif // KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ +#endif // KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ diff --git a/lapack/unit_test/Test_Blas_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp similarity index 83% rename from lapack/unit_test/Test_Blas_gesv.hpp rename to lapack/unit_test/Test_Lapack_gesv.hpp index 57ee6373bf..25d5089a58 100644 --- a/lapack/unit_test/Test_Blas_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -14,19 +14,19 @@ // //@HEADER -// only enable this test where KokkosBlas supports gesv: -// CUDA+MAGMA and HOST+BLAS -#if (defined(TEST_CUDA_BLAS_CPP) && \ +// only enable this test where KokkosLapack supports gesv: +// CUDA+MAGMA and HOST+LAPACK +#if (defined(TEST_CUDA_LAPACK_CPP) && \ defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ - (defined(TEST_OPENMP_BLAS_CPP) || defined(TEST_OPENMPTARGET_BLAS_CPP) || \ - defined(TEST_SERIAL_BLAS_CPP) || defined(TEST_THREADS_BLAS_CPP))) + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ + defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include #include #include -#include +#include #include #include #include @@ -89,7 +89,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Solve. try { - KokkosBlas::gesv(A, B, ipiv); + KokkosLapack::gesv(A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) @@ -97,7 +97,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { bool nopivot_runtime_err = false; bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -106,7 +106,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { notpl_runtime_err = true; #endif #else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -194,7 +194,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, // Solve. try { - KokkosBlas::gesv(A, B, ipiv); + KokkosLapack::gesv(A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) @@ -202,7 +202,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, bool nopivot_runtime_err = false; bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -211,7 +211,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, notpl_runtime_err = true; #endif #else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -342,16 +342,16 @@ int test_gesv_mrhs(const char* mode) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_float"); - test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_float"); + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_float"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_float"); + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -360,16 +360,16 @@ TEST_F(TestCategory, gesv_mrhs_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_double"); - test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_double"); + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_double"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_double"); + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -378,16 +378,17 @@ TEST_F(TestCategory, gesv_mrhs_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_double"); - test_gesv, TestDevice>("N"); // No pivoting - test_gesv, TestDevice>("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_double"); + test_gesv, TestExecSpace>("N"); // No pivoting + test_gesv, TestExecSpace>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_double"); - test_gesv_mrhs, TestDevice>("N"); // No pivoting - test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_double"); + test_gesv_mrhs, TestExecSpace>("N"); // No pivoting + test_gesv_mrhs, TestExecSpace>( + "Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -396,18 +397,19 @@ TEST_F(TestCategory, gesv_mrhs_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_float"); - test_gesv, TestDevice>("N"); // No pivoting - test_gesv, TestDevice>("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_float"); + test_gesv, TestExecSpace>("N"); // No pivoting + test_gesv, TestExecSpace>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_float"); - test_gesv_mrhs, TestDevice>("N"); // No pivoting - test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_float"); + test_gesv_mrhs, TestExecSpace>("N"); // No pivoting + test_gesv_mrhs, TestExecSpace>( + "Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif -#endif // CUDA+MAGMA or BLAS+HOST +#endif // CUDA+MAGMA or LAPACK+HOST diff --git a/lapack/unit_test/Test_Blas_trtri.hpp b/lapack/unit_test/Test_Lapack_trtri.hpp similarity index 88% rename from lapack/unit_test/Test_Blas_trtri.hpp rename to lapack/unit_test/Test_Lapack_trtri.hpp index aa12fa959b..498b1248f3 100644 --- a/lapack/unit_test/Test_Blas_trtri.hpp +++ b/lapack/unit_test/Test_Lapack_trtri.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -118,7 +118,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, // const int As0 = A.stride(0), As1 = A.stride(1); // const int Ae0 = A.extent(0), Ae1 = A.extent(1); - // printf("KokkosBlas::trtri test for %c %c, M %d, N %d, eps %g, ViewType: %s, + // printf("KokkosLapack::trtri test for %c %c, M %d, N %d, eps %g, ViewType: %s, // A.stride(0): %d, A.stride(1): %d, A.extent(0): %d, A.extent(1): %d // START\n", uplo[0],diag[0],M,N,eps,typeid(ViewTypeA).name(), As0, As1, Ae0, // Ae1); fflush(stdout); @@ -141,7 +141,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, host_A(bad_diag_idx - 1, bad_diag_idx - 1) = ScalarA(0); Kokkos::deep_copy(A, host_A); } - return KokkosBlas::trtri(uplo, diag, A); + return KokkosLapack::trtri(uplo, diag, A); } // If M is greater than 100 and A is an unit triangluar matrix, make A the @@ -158,13 +158,13 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, using functor_type = UnitDiagTRTRI; functor_type udtrtri(A); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRTRI", + Kokkos::parallel_for("KokkosLapack::Test::UnitDiagTRTRI", Kokkos::RangePolicy(0, M), udtrtri); } else { //(diag[0]=='N')||(diag[0]=='n') using functor_type = NonUnitDiagTRTRI; functor_type nudtrtri(A); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRTRI", + Kokkos::parallel_for("KokkosLapack::Test::NonUnitDiagTRTRI", Kokkos::RangePolicy(0, M), nudtrtri); } Kokkos::fence(); @@ -195,11 +195,11 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, #endif // A = A^-1 - ret = KokkosBlas::trtri(uplo, diag, A); + ret = KokkosLapack::trtri(uplo, diag, A); Kokkos::fence(); if (ret) { - printf("KokkosBlas::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], + printf("KokkosLapack::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], typeid(ViewTypeA).name(), ret); return ret; } @@ -229,7 +229,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, vgemm.alpha = ScalarA(1); vgemm.beta = beta; Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", + "KokkosLapack::Test::VanillaGEMM", Kokkos::TeamPolicy( M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), @@ -362,11 +362,11 @@ int test_trtri(const char* mode) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_float"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_float"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -375,11 +375,11 @@ TEST_F(TestCategory, trtri_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_double"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_double"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -388,11 +388,11 @@ TEST_F(TestCategory, trtri_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_double"); - test_trtri, TestDevice>("UN"); - test_trtri, TestDevice>("UU"); - test_trtri, TestDevice>("LN"); - test_trtri, TestDevice>("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_double"); + test_trtri, TestExecSpace>("UN"); + test_trtri, TestExecSpace>("UU"); + test_trtri, TestExecSpace>("LN"); + test_trtri, TestExecSpace>("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -401,11 +401,11 @@ TEST_F(TestCategory, trtri_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_float"); - test_trtri, TestDevice>("UN"); - test_trtri, TestDevice>("UU"); - test_trtri, TestDevice>("LN"); - test_trtri, TestDevice>("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_float"); + test_trtri, TestExecSpace>("UN"); + test_trtri, TestExecSpace>("UU"); + test_trtri, TestExecSpace>("LN"); + test_trtri, TestExecSpace>("LU"); Kokkos::Profiling::popRegion(); } #endif From 05afd000f0b76eac0af6143b7d1150329482d484 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 6 Sep 2023 22:10:12 -0600 Subject: [PATCH 031/326] Backup --- blas/CMakeLists.txt | 14 --- lapack/CMakeLists.txt | 67 ++++++++++++ lapack/tpls/KokkosLapack_Host_tpl.cpp | 151 ++++++++++++++++++++++++++ 3 files changed, 218 insertions(+), 14 deletions(-) create mode 100644 lapack/CMakeLists.txt create mode 100644 lapack/tpls/KokkosLapack_Host_tpl.cpp diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 816d68e443..5bc7217cfd 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -101,13 +101,6 @@ KOKKOSKERNELS_GENERATE_ETI(Blas1_dot_mv dot TYPE_LISTS FLOATS LAYOUTS DEVICES ) -KOKKOSKERNELS_GENERATE_ETI(Blas_gesv gesv - COMPONENTS blas - HEADER_LIST ETI_HEADERS - SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES -) - KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby axpby COMPONENTS blas HEADER_LIST ETI_HEADERS @@ -331,10 +324,3 @@ KOKKOSKERNELS_GENERATE_ETI(Blas3_trmm trmm SOURCE_LIST SOURCES TYPE_LISTS FLOATS LAYOUTS DEVICES ) - -KOKKOSKERNELS_GENERATE_ETI(Blas_trtri trtri - COMPONENTS blas - HEADER_LIST ETI_HEADERS - SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES -) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt new file mode 100644 index 0000000000..0f38d0aa50 --- /dev/null +++ b/lapack/CMakeLists.txt @@ -0,0 +1,67 @@ +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/impl) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/eti) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/lapack/eti) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/tpls) + +# Adding unit-tests +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/lapack) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/lapack) + +####################### +# # +# Logic for LAPACK TPLs # +# # +####################### + +#Include LAPACK, Lapack host wrapper +IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) + #Do NOT add this to include path + APPEND_GLOB(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/tpls/KokkosLapack_Host_tpl.cpp) +ENDIF() + +# Include host lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Host_tpl.cpp + ) +ENDIF() + +# Include cuda lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_CULAPACK) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Cuda_tpl.cpp + ) +ENDIF() + +# Include rocm lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_ROCLAPACK) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Rocm_tpl.cpp + ) +ENDIF() + +################## +# # +# ETI generation # +# # +################## + +#Build up a list of DECL, AVAIL, and INST macros +#that should be instantiated based on input options +#Generate @X@ variables in the template X.hpp.in and X.cpp.in +#files containing the list of all needed macros + +KOKKOSKERNELS_GENERATE_ETI(Lapack_gesv gesv + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Lapack_trtri trtri + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp new file mode 100644 index 0000000000..8e7158528e --- /dev/null +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -0,0 +1,151 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \file KokkosLapack_Host_tpl.cpp +/// \brief LAPACK wrapper for host tpls +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosKernels_config.h" +#include "KokkosLapack_Host_tpl.hpp" + +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) + +/// Fortran headers +extern "C" { + +/// +/// Gesv +/// + +void F77_LAPACK_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, + int*); +void F77_LAPACK_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, + int*, int*); +void F77_LAPACK_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, + std::complex*, int*, int*); +void F77_LAPACK_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, + int*, std::complex*, int*, int*); + +/// +/// Trtri +/// +/* + HostLapack::trtri(const char uplo, const char diag, + int n, const float *a, int lda) { + int info = 0; + F77_FUNC_STRTRI(&uplo, + &diag, &n, + a, &lda, &info); +*/ +void F77_LAPACK_MANGLE(strtri, STRTRI)(const char*, const char*, int*, + const float*, int*, int*); +void F77_LAPACK_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, + const double*, int*, int*); +void F77_LAPACK_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, + const std::complex*, int*, int*); +void F77_LAPACK_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, + const std::complex*, int*, int*); +} + +#define F77_FUNC_SGESV F77_LAPACK_MANGLE(sgesv, SGESV) +#define F77_FUNC_DGESV F77_LAPACK_MANGLE(dgesv, DGESV) +#define F77_FUNC_CGESV F77_LAPACK_MANGLE(cgesv, CGESV) +#define F77_FUNC_ZGESV F77_LAPACK_MANGLE(zgesv, ZGESV) + +#define F77_FUNC_STRTRI F77_LAPACK_MANGLE(strtri, STRTRI) +#define F77_FUNC_DTRTRI F77_LAPACK_MANGLE(dtrtri, DTRTRI) +#define F77_FUNC_CTRTRI F77_LAPACK_MANGLE(ctrtri, CTRTRI) +#define F77_FUNC_ZTRTRI F77_LAPACK_MANGLE(ztrtri, ZTRTRI) + +namespace KokkosLapack { +namespace Impl { + +/// +/// float +/// + +template <> +void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, + float* b, int ldb, int info) { + F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack::trtri(const char uplo, const char diag, int n, + const float* a, int lda) { + int info = 0; + F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// double +/// + +template <> +void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, + double* b, int ldb, int info) { + F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack::trtri(const char uplo, const char diag, int n, + const double* a, int lda) { + int info = 0; + F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// std::complex +/// + +template <> +void HostLapack >::gesv(int n, int rhs, + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { + F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack >::trtri(const char uplo, const char diag, + int n, const std::complex* a, + int lda) { + int info = 0; + F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// std::complex +/// + +template <> +void HostLapack >::gesv(int n, int rhs, + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { + F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack >::trtri(const char uplo, const char diag, + int n, const std::complex* a, + int lda) { + int info = 0; + F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK From e455f377431168d213b023e703d8111e6cbd765d Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 6 Sep 2023 22:25:25 -0600 Subject: [PATCH 032/326] Backup --- blas/tpls/KokkosBlas_Host_tpl.cpp | 96 ------------------------------- blas/unit_test/Test_Blas.hpp | 3 - lapack/unit_test/CMakeLists.txt | 94 ++++++++++++++++++++++++++++++ lapack/unit_test/Test_Lapack.hpp | 22 +++++++ 4 files changed, 116 insertions(+), 99 deletions(-) create mode 100644 lapack/unit_test/CMakeLists.txt create mode 100644 lapack/unit_test/Test_Lapack.hpp diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index a7be0d31ab..88c3ef7bbd 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -412,38 +412,6 @@ void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, const std::complex*, int*, /* */ std::complex*, int*); -/// -/// Gesv -/// - -void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, - int*); -void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, - int*, int*); -void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, - std::complex*, int*, int*); -void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, - int*, std::complex*, int*, int*); - -/// -/// Trtri -/// -/* - HostBlas::trtri(const char uplo, const char diag, - int n, const float *a, int lda) { - int info = 0; - F77_FUNC_STRTRI(&uplo, - &diag, &n, - a, &lda, &info); -*/ -void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, - const float*, int*, int*); -void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, - const double*, int*, int*); -void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); -void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); } void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, @@ -559,16 +527,6 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CTRSM F77_BLAS_MANGLE(ctrsm, CTRSM) #define F77_FUNC_ZTRSM F77_BLAS_MANGLE(ztrsm, ZTRSM) -#define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) -#define F77_FUNC_DGESV F77_BLAS_MANGLE(dgesv, DGESV) -#define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) -#define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) - -#define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) -#define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) -#define F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) -#define F77_FUNC_ZTRTRI F77_BLAS_MANGLE(ztrtri, ZTRTRI) - namespace KokkosBlas { namespace Impl { @@ -688,18 +646,6 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } -template <> -void HostBlas::gesv(int n, int rhs, float* a, int lda, int* ipiv, - float* b, int ldb, int info) { - F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas::trtri(const char uplo, const char diag, int n, - const float* a, int lda) { - int info = 0; - F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// double @@ -818,18 +764,6 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } -template <> -void HostBlas::gesv(int n, int rhs, double* a, int lda, int* ipiv, - double* b, int ldb, int info) { - F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas::trtri(const char uplo, const char diag, int n, - const double* a, int lda) { - int info = 0; - F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// std::complex @@ -1000,21 +934,6 @@ void HostBlas >::trsm(const char side, const char uplo, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } -template <> -void HostBlas >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { - F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { - int info = 0; - F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// std::complex @@ -1183,21 +1102,6 @@ void HostBlas >::trsm( (const std::complex*)a, &lda, (std::complex*)b, &ldb); } -template <> -void HostBlas >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { - F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { - int info = 0; - F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} } // namespace Impl } // namespace KokkosBlas diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index b370436391..9bb37d8d95 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -16,9 +16,6 @@ #ifndef TEST_BLAS_HPP #define TEST_BLAS_HPP -#include "Test_Blas_gesv.hpp" -#include "Test_Blas_trtri.hpp" - // Blas 1 #include "Test_Blas1_abs.hpp" #include "Test_Blas1_asum.hpp" diff --git a/lapack/unit_test/CMakeLists.txt b/lapack/unit_test/CMakeLists.txt new file mode 100644 index 0000000000..b0ccaf8e7e --- /dev/null +++ b/lapack/unit_test/CMakeLists.txt @@ -0,0 +1,94 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/test_common) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${PACKAGE_SOURCE_DIR}/test_common) + +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + +##################### +# # +# Define unit-tests # +# # +##################### + +##################### +# # +# Add GPU backends # +# # +##################### +IF (KOKKOS_ENABLE_CUDA) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_cuda + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Cuda_Blas.cpp + COMPONENTS blas + ) +ENDIF () + +IF (KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_hip + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_HIP_Blas.cpp + COMPONENTS blas + ) +ENDIF () + +IF (KOKKOS_ENABLE_SYCL) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_sycl + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_SYCL_Blas.cpp + COMPONENTS blas + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMPTARGET) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # blas_openmptarget + # SOURCES + # ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + # backends/Test_OpenMPTarget_Blas.cpp + # COMPONENTS blas + # ) +ENDIF () + + + +##################### +# # +# Add CPU backends # +# # +##################### +IF (KOKKOS_ENABLE_SERIAL) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_serial + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Serial_Blas.cpp + COMPONENTS blas + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMP) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_openmp + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_OpenMP_Blas.cpp + COMPONENTS blas + ) +ENDIF () + +IF (KOKKOS_ENABLE_THREADS) + KOKKOSKERNELS_ADD_UNIT_TEST( + blas_threads + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Threads_Blas.cpp + COMPONENTS blas + ) +ENDIF () + diff --git a/lapack/unit_test/Test_Lapack.hpp b/lapack/unit_test/Test_Lapack.hpp new file mode 100644 index 0000000000..815c442884 --- /dev/null +++ b/lapack/unit_test/Test_Lapack.hpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_LAPACK_HPP +#define TEST_LAPACK_HPP + +#include "Test_Lapack_gesv.hpp" +#include "Test_Lapack_trtri.hpp" + +#endif // TEST_LAPACK_HPP From a63d094422b98c1dc7709d654f6f895448781786 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 7 Sep 2023 01:24:46 -0600 Subject: [PATCH 033/326] Backup --- cmake/kokkoskernels_components.cmake | 10 + cmake/kokkoskernels_tpls.cmake | 15 ++ lapack/tpls/KokkosLapack_Host_tpl.cpp | 33 +-- lapack/tpls/KokkosLapack_tpl_spec.hpp | 234 ++++++++++++++++++ lapack/unit_test/CMakeLists.txt | 42 ++-- .../unit_test/backends/Test_Serial_Lapack.cpp | 22 ++ sparse/src/KokkosSparse_sptrsv_supernode.hpp | 6 +- 7 files changed, 322 insertions(+), 40 deletions(-) create mode 100644 lapack/tpls/KokkosLapack_tpl_spec.hpp create mode 100644 lapack/unit_test/backends/Test_Serial_Lapack.cpp diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index 1feb5bb8b8..16a784bd1f 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -29,6 +29,13 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to build the blas component. Default: OFF" ) +KOKKOSKERNELS_ADD_OPTION( + "ENABLE_COMPONENT_LAPACK" + OFF + BOOL + "Whether to build the lapack component. Default: OFF" +) + # SPARSE depends on everything else at the moment. KOKKOSKERNELS_ADD_OPTION( "ENABLE_COMPONENT_SPARSE" @@ -67,6 +74,7 @@ ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_LAPACK ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() @@ -74,6 +82,7 @@ ENDIF() IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_LAPACK ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_ODE ON CACHE BOOL "" FORCE) @@ -85,6 +94,7 @@ ENDIF() # but marking it as advanced should hide it from GUIs IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED AND KokkosKernels_ENABLE_COMPONENT_BLAS + AND KokkosKernels_ENABLE_COMPONENT_LAPACK AND KokkosKernels_ENABLE_COMPONENT_GRAPH AND KokkosKernels_ENABLE_COMPONENT_SPARSE AND KokkosKernels_ENABLE_COMPONENT_ODE) diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index f650168757..6496487081 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -440,6 +440,20 @@ IF ("${F77_BLAS_MANGLE}" STREQUAL "") ENDIF() ENDIF() +# AquiEEP +IF ("${F77_LAPACK_MANGLE}" STREQUAL "") + IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_MAGMA OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) + ENABLE_LANGUAGE(C) + ENABLE_LANGUAGE(Fortran) + INCLUDE(FortranCInterface) + IF (FortranCInterface_GLOBAL_SUFFIX STREQUAL "") + SET(F77_LAPACK_MANGLE "(name,NAME) ${FortranCInterface_GLOBAL_PREFIX}name") + ELSE () + SET(F77_LAPACK_MANGLE "(name,NAME) ${FortranCInterface_GLOBAL_PREFIX}name ## ${FortranCInterface_GLOBAL_SUFFIX}") + ENDIF () + ENDIF() +ENDIF() + KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_CUDA_TPLS OFF BOOL "Whether CUDA TPLs should be enabled by default. Default: OFF") SET(CUBLAS_DEFAULT ${KOKKOS_ENABLE_CUDA}) SET(CUSPARSE_DEFAULT ${KOKKOS_ENABLE_CUDA}) @@ -466,6 +480,7 @@ KOKKOSKERNELS_ADD_TPL_OPTION(ROCBLAS ${ROCBLAS_DEFAULT} "Whether to enable KOKKOSKERNELS_ADD_TPL_OPTION(ROCSPARSE ${ROCSPARSE_DEFAULT} "Whether to enable ROCSPARSE" DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") +#AquiEEP IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) IF (F77_BLAS_MANGLE STREQUAL "(name,NAME) name ## _") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DADD_ -fopenmp -lgfortran") diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp index 8e7158528e..6ece9fe914 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -29,13 +29,14 @@ extern "C" { /// Gesv /// -void F77_LAPACK_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, +// AquiEEP +void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, int*); -void F77_LAPACK_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, +void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, int*, int*); -void F77_LAPACK_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, +void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); -void F77_LAPACK_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, +void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); /// @@ -49,25 +50,25 @@ void F77_LAPACK_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, &diag, &n, a, &lda, &info); */ -void F77_LAPACK_MANGLE(strtri, STRTRI)(const char*, const char*, int*, +void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, const float*, int*, int*); -void F77_LAPACK_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, +void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, const double*, int*, int*); -void F77_LAPACK_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, +void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, const std::complex*, int*, int*); -void F77_LAPACK_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, +void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, const std::complex*, int*, int*); } -#define F77_FUNC_SGESV F77_LAPACK_MANGLE(sgesv, SGESV) -#define F77_FUNC_DGESV F77_LAPACK_MANGLE(dgesv, DGESV) -#define F77_FUNC_CGESV F77_LAPACK_MANGLE(cgesv, CGESV) -#define F77_FUNC_ZGESV F77_LAPACK_MANGLE(zgesv, ZGESV) +#define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) +#define F77_FUNC_DGESV F77_BLAS_MANGLE(dgesv, DGESV) +#define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) +#define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) -#define F77_FUNC_STRTRI F77_LAPACK_MANGLE(strtri, STRTRI) -#define F77_FUNC_DTRTRI F77_LAPACK_MANGLE(dtrtri, DTRTRI) -#define F77_FUNC_CTRTRI F77_LAPACK_MANGLE(ctrtri, CTRTRI) -#define F77_FUNC_ZTRTRI F77_LAPACK_MANGLE(ztrtri, ZTRTRI) +#define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) +#define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) +#define F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) +#define F77_FUNC_ZTRTRI F77_BLAS_MANGLE(ztrtri, ZTRTRI) namespace KokkosLapack { namespace Impl { diff --git a/lapack/tpls/KokkosLapack_tpl_spec.hpp b/lapack/tpls/KokkosLapack_tpl_spec.hpp new file mode 100644 index 0000000000..a20c5d9a92 --- /dev/null +++ b/lapack/tpls/KokkosLapack_tpl_spec.hpp @@ -0,0 +1,234 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_TPL_SPEC_HPP_ +#define KOKKOSLAPACK_TPL_SPEC_HPP_ + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CULAPACK +#include "cuda_runtime.h" +#include "culapack_v2.h" + +namespace KokkosLapack { +namespace Impl { + +struct CudaLapackSingleton { + culapackHandle_t handle; + + CudaLapackSingleton(); + + static CudaLapackSingleton& singleton(); +}; + +inline void culapack_internal_error_throw(culapackStatus_t culapackState, + const char* name, const char* file, + const int line) { + std::ostringstream out; + // out << name << " error( " << culapackGetStatusName(culapackState) + // << "): " << culapackGetStatusString(culapackState); + out << name << " error( "; + switch (culapackState) { + case CULAPACK_STATUS_NOT_INITIALIZED: + out << "CULAPACK_STATUS_NOT_INITIALIZED): the library was not initialized."; + break; + case CULAPACK_STATUS_ALLOC_FAILED: + out << "CULAPACK_STATUS_ALLOC_FAILED): the resource allocation failed."; + break; + case CULAPACK_STATUS_INVALID_VALUE: + out << "CULAPACK_STATUS_INVALID_VALUE): an invalid numerical value was " + "used as an argument."; + break; + case CULAPACK_STATUS_ARCH_MISMATCH: + out << "CULAPACK_STATUS_ARCH_MISMATCH): an absent device architectural " + "feature is required."; + break; + case CULAPACK_STATUS_MAPPING_ERROR: + out << "CULAPACK_STATUS_MAPPING_ERROR): an access to GPU memory space " + "failed."; + break; + case CULAPACK_STATUS_EXECUTION_FAILED: + out << "CULAPACK_STATUS_EXECUTION_FAILED): the GPU program failed to " + "execute."; + break; + case CULAPACK_STATUS_INTERNAL_ERROR: + out << "CULAPACK_STATUS_INTERNAL_ERROR): an internal operation failed."; + break; + case CULAPACK_STATUS_NOT_SUPPORTED: + out << "CULAPACK_STATUS_NOT_SUPPORTED): the feature required is not " + "supported."; + break; + default: out << "unrecognized error code): this is bad!"; break; + } + if (file) { + out << " " << file << ":" << line; + } + throw std::runtime_error(out.str()); +} + +inline void culapack_internal_safe_call(culapackStatus_t culapackState, + const char* name, + const char* file = nullptr, + const int line = 0) { + if (CULAPACK_STATUS_SUCCESS != culapackState) { + culapack_internal_error_throw(culapackState, name, file, line); + } +} + +// The macro below defines the interface for the safe culapack calls. +// The functions themselves are protected by impl namespace and this +// is not meant to be used by external application or libraries. +#define KOKKOS_CULAPACK_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::culapack_internal_safe_call(call, #call, __FILE__, __LINE__) + +/// \brief This function converts KK transpose mode to cuLAPACK transpose mode +inline culapackOperation_t trans_mode_kk_to_culapack(const char kkMode[]) { + culapackOperation_t trans; + if ((kkMode[0] == 'N') || (kkMode[0] == 'n')) + trans = CULAPACK_OP_N; + else if ((kkMode[0] == 'T') || (kkMode[0] == 't')) + trans = CULAPACK_OP_T; + else + trans = CULAPACK_OP_C; + return trans; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_CULAPACK + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCLAPACK +#include + +namespace KokkosLapack { +namespace Impl { + +struct RocLapackSingleton { + roclapack_handle handle; + + RocLapackSingleton(); + + static RocLapackSingleton& singleton(); +}; + +inline void roclapack_internal_error_throw(roclapack_status roclapackState, + const char* name, const char* file, + const int line) { + std::ostringstream out; + out << name << " error( "; + switch (roclapackState) { + case roclapack_status_invalid_handle: + out << "roclapack_status_invalid_handle): handle not initialized, invalid " + "or null."; + break; + case roclapack_status_not_implemented: + out << "roclapack_status_not_implemented): function is not implemented."; + break; + case roclapack_status_invalid_pointer: + out << "roclapack_status_invalid_pointer): invalid pointer argument."; + break; + case roclapack_status_invalid_size: + out << "roclapack_status_invalid_size): invalid size argument."; + break; + case roclapack_status_memory_error: + out << "roclapack_status_memory_error): failed internal memory allocation, " + "copy or dealloc."; + break; + case roclapack_status_internal_error: + out << "roclapack_status_internal_error): other internal library failure."; + break; + case roclapack_status_perf_degraded: + out << "roclapack_status_perf_degraded): performance degraded due to low " + "device memory."; + break; + case roclapack_status_size_query_mismatch: + out << "unmatched start/stop size query): ."; + break; + case roclapack_status_size_increased: + out << "roclapack_status_size_increased): queried device memory size " + "increased."; + break; + case roclapack_status_size_unchanged: + out << "roclapack_status_size_unchanged): queried device memory size " + "unchanged."; + break; + case roclapack_status_invalid_value: + out << "roclapack_status_invalid_value): passed argument not valid."; + break; + case roclapack_status_continue: + out << "roclapack_status_continue): nothing preventing function to " + "proceed."; + break; + case roclapack_status_check_numerics_fail: + out << "roclapack_status_check_numerics_fail): will be set if the " + "vector/matrix has a NaN or an Infinity."; + break; + default: out << "unrecognized error code): this is bad!"; break; + } + if (file) { + out << " " << file << ":" << line; + } + throw std::runtime_error(out.str()); +} + +inline void roclapack_internal_safe_call(roclapack_status roclapackState, + const char* name, + const char* file = nullptr, + const int line = 0) { + if (roclapack_status_success != roclapackState) { + roclapack_internal_error_throw(roclapackState, name, file, line); + } +} + +// The macro below defines the interface for the safe roclapack calls. +// The functions themselves are protected by impl namespace and this +// is not meant to be used by external application or libraries. +#define KOKKOS_ROCLAPACK_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::roclapack_internal_safe_call(call, #call, __FILE__, __LINE__) + +/// \brief This function converts KK transpose mode to rocLAPACK transpose mode +inline roclapack_operation trans_mode_kk_to_roclapack(const char kkMode[]) { + roclapack_operation trans; + if ((kkMode[0] == 'N') || (kkMode[0] == 'n')) + trans = roclapack_operation_none; + else if ((kkMode[0] == 'T') || (kkMode[0] == 't')) + trans = roclapack_operation_transpose; + else + trans = roclapack_operation_conjugate_transpose; + return trans; +} + +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCLAPACK + +// If LAPACK TPL is enabled, it is preferred over magma's LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "magma_v2.h" + +namespace KokkosLapack { +namespace Impl { + +struct MagmaSingleton { + MagmaSingleton(); + + static MagmaSingleton& singleton(); +}; + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA + +#endif // KOKKOSLAPACK_TPL_SPEC_HPP_ diff --git a/lapack/unit_test/CMakeLists.txt b/lapack/unit_test/CMakeLists.txt index b0ccaf8e7e..a2c2305a12 100644 --- a/lapack/unit_test/CMakeLists.txt +++ b/lapack/unit_test/CMakeLists.txt @@ -17,41 +17,41 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_C ##################### IF (KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_UNIT_TEST( - blas_cuda + lapack_cuda SOURCES ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp - backends/Test_Cuda_Blas.cpp - COMPONENTS blas + backends/Test_Cuda_Lapack.cpp + COMPONENTS lapack ) ENDIF () IF (KOKKOS_ENABLE_HIP) KOKKOSKERNELS_ADD_UNIT_TEST( - blas_hip + lapack_hip SOURCES ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp - backends/Test_HIP_Blas.cpp - COMPONENTS blas + backends/Test_HIP_Lapack.cpp + COMPONENTS lapack ) ENDIF () IF (KOKKOS_ENABLE_SYCL) KOKKOSKERNELS_ADD_UNIT_TEST( - blas_sycl + lapack_sycl SOURCES ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp - backends/Test_SYCL_Blas.cpp - COMPONENTS blas + backends/Test_SYCL_Lapack.cpp + COMPONENTS lapack ) ENDIF () IF (KOKKOS_ENABLE_OPENMPTARGET) # KOKKOSKERNELS_ADD_UNIT_TEST( - # blas_openmptarget + # lapack_openmptarget # SOURCES # ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp - # backends/Test_OpenMPTarget_Blas.cpp - # COMPONENTS blas + # backends/Test_OpenMPTarget_Lapack.cpp + # COMPONENTS lapack # ) ENDIF () @@ -64,31 +64,31 @@ ENDIF () ##################### IF (KOKKOS_ENABLE_SERIAL) KOKKOSKERNELS_ADD_UNIT_TEST( - blas_serial + lapack_serial SOURCES ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp - backends/Test_Serial_Blas.cpp - COMPONENTS blas + backends/Test_Serial_Lapack.cpp + COMPONENTS lapack ) ENDIF () IF (KOKKOS_ENABLE_OPENMP) KOKKOSKERNELS_ADD_UNIT_TEST( - blas_openmp + lapack_openmp SOURCES ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp - backends/Test_OpenMP_Blas.cpp - COMPONENTS blas + backends/Test_OpenMP_Lapack.cpp + COMPONENTS lapack ) ENDIF () IF (KOKKOS_ENABLE_THREADS) KOKKOSKERNELS_ADD_UNIT_TEST( - blas_threads + lapack_threads SOURCES ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp - backends/Test_Threads_Blas.cpp - COMPONENTS blas + backends/Test_Threads_Lapack.cpp + COMPONENTS lapack ) ENDIF () diff --git a/lapack/unit_test/backends/Test_Serial_Lapack.cpp b/lapack/unit_test/backends/Test_Serial_Lapack.cpp new file mode 100644 index 0000000000..d0324b9642 --- /dev/null +++ b/lapack/unit_test/backends/Test_Serial_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SERIAL_LAPACK_CPP +#define TEST_SERIAL_LAPACK_CPP + +#include +#include + +#endif // TEST_SERIAL_LAPACK_CPP diff --git a/sparse/src/KokkosSparse_sptrsv_supernode.hpp b/sparse/src/KokkosSparse_sptrsv_supernode.hpp index 0be3abac08..c6e5d406a7 100644 --- a/sparse/src/KokkosSparse_sptrsv_supernode.hpp +++ b/sparse/src/KokkosSparse_sptrsv_supernode.hpp @@ -27,7 +27,7 @@ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) #include "KokkosBlas3_trmm.hpp" -#include "KokkosBlas_trtri.hpp" +#include "KokkosLapack_trtri.hpp" #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" @@ -1472,12 +1472,12 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, // call trtri on device auto dViewLjj = Kokkos::subview(dViewL, range_type(0, nscol), Kokkos::ALL()); - KokkosBlas::trtri(&uplo_char, &diag_char, dViewLjj); + KokkosLapack::trtri(&uplo_char, &diag_char, dViewLjj); } else #endif { // call trtri on host - KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); + KokkosLapack::trtri(&uplo_char, &diag_char, Ljj); } #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE time1 += timer.seconds(); From 8aecf38948f9c4e42804674a0afb9c44d5d677b8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 7 Sep 2023 01:59:16 -0600 Subject: [PATCH 034/326] Backup --- .../unit_test/backends/Test_Cuda_Lapack.cpp | 22 +++++++++++++++++++ .../unit_test/backends/Test_OpenMP_Lapack.cpp | 22 +++++++++++++++++++ .../backends/Test_Threads_Lapack.cpp | 22 +++++++++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 lapack/unit_test/backends/Test_Cuda_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_OpenMP_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_Threads_Lapack.cpp diff --git a/lapack/unit_test/backends/Test_Cuda_Lapack.cpp b/lapack/unit_test/backends/Test_Cuda_Lapack.cpp new file mode 100644 index 0000000000..d75988ef81 --- /dev/null +++ b/lapack/unit_test/backends/Test_Cuda_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_CUDA_LAPACK_CPP +#define TEST_CUDA_LAPACK_CPP + +#include +#include + +#endif // TEST_CUDA_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp b/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp new file mode 100644 index 0000000000..533580fd23 --- /dev/null +++ b/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMP_LAPACK_CPP +#define TEST_OPENMP_LAPACK_CPP + +#include +#include + +#endif // TEST_OPENMP_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_Threads_Lapack.cpp b/lapack/unit_test/backends/Test_Threads_Lapack.cpp new file mode 100644 index 0000000000..aa1acbcf6c --- /dev/null +++ b/lapack/unit_test/backends/Test_Threads_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_THREADS_LAPACK_CPP +#define TEST_THREADS_LAPACK_CPP + +#include +#include + +#endif // TEST_THREADS_LAPACK_CPP From 872a553e01981cd213806406b452bf900caf0b9e Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 3 Oct 2023 11:43:16 -0600 Subject: [PATCH 035/326] Backup --- lapack/unit_test/Test_Lapack_gesv.hpp | 32 +++++++++++++------------- lapack/unit_test/Test_Lapack_trtri.hpp | 32 +++++++++++++------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 25d5089a58..f37770c812 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -343,15 +343,15 @@ int test_gesv_mrhs(const char* mode) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_float"); - test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_float"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -361,15 +361,15 @@ TEST_F(TestCategory, gesv_mrhs_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_double"); - test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_double"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -379,15 +379,15 @@ TEST_F(TestCategory, gesv_mrhs_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_double"); - test_gesv, TestExecSpace>("N"); // No pivoting - test_gesv, TestExecSpace>("Y"); // Partial pivoting + test_gesv, TestDevice>("N"); // No pivoting + test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_double"); - test_gesv_mrhs, TestExecSpace>("N"); // No pivoting - test_gesv_mrhs, TestExecSpace>( + test_gesv_mrhs, TestDevice>("N"); // No pivoting + test_gesv_mrhs, TestDevice>( "Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } @@ -398,15 +398,15 @@ TEST_F(TestCategory, gesv_mrhs_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_float"); - test_gesv, TestExecSpace>("N"); // No pivoting - test_gesv, TestExecSpace>("Y"); // Partial pivoting + test_gesv, TestDevice>("N"); // No pivoting + test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_float"); - test_gesv_mrhs, TestExecSpace>("N"); // No pivoting - test_gesv_mrhs, TestExecSpace>( + test_gesv_mrhs, TestDevice>("N"); // No pivoting + test_gesv_mrhs, TestDevice>( "Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } diff --git a/lapack/unit_test/Test_Lapack_trtri.hpp b/lapack/unit_test/Test_Lapack_trtri.hpp index 498b1248f3..0105803567 100644 --- a/lapack/unit_test/Test_Lapack_trtri.hpp +++ b/lapack/unit_test/Test_Lapack_trtri.hpp @@ -363,10 +363,10 @@ int test_trtri(const char* mode) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_float"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -376,10 +376,10 @@ TEST_F(TestCategory, trtri_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_double"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -389,10 +389,10 @@ TEST_F(TestCategory, trtri_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_double"); - test_trtri, TestExecSpace>("UN"); - test_trtri, TestExecSpace>("UU"); - test_trtri, TestExecSpace>("LN"); - test_trtri, TestExecSpace>("LU"); + test_trtri, TestDevice>("UN"); + test_trtri, TestDevice>("UU"); + test_trtri, TestDevice>("LN"); + test_trtri, TestDevice>("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -402,10 +402,10 @@ TEST_F(TestCategory, trtri_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_float"); - test_trtri, TestExecSpace>("UN"); - test_trtri, TestExecSpace>("UU"); - test_trtri, TestExecSpace>("LN"); - test_trtri, TestExecSpace>("LU"); + test_trtri, TestDevice>("UN"); + test_trtri, TestDevice>("UU"); + test_trtri, TestDevice>("LN"); + test_trtri, TestDevice>("LU"); Kokkos::Profiling::popRegion(); } #endif From 3223af39c87db07b89959bde8b4491939da89d48 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 10 Oct 2023 11:23:11 -0600 Subject: [PATCH 036/326] Backup --- .github/workflows/osx.yml | 1 + CMakeLists.txt | 18 +++++++++++++++++- batched/KokkosBatched_Util.hpp | 1 + ...okkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 3 ++- cm_generate_makefile.bash | 16 +++++++++++++++- cmake/KokkosKernels_config.h.in | 4 ++++ cmake/Modules/FindTPLCUSOLVER.cmake | 18 ++++++++++++++++++ cmake/Modules/FindTPLROCSOLVER.cmake | 12 ++++++++++++ cmake/kokkoskernels_tpls.cmake | 10 ++++++++++ example/half/xpy.cpp | 2 +- lapack/CMakeLists.txt | 12 ++++++------ 11 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 cmake/Modules/FindTPLCUSOLVER.cmake create mode 100644 cmake/Modules/FindTPLROCSOLVER.cmake diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 8d9f7123f8..10688ddb70 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -103,6 +103,7 @@ jobs: -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ .. - name: build_kokkos_kernels diff --git a/CMakeLists.txt b/CMakeLists.txt index 812640374b..85ce79d9ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,7 +193,7 @@ ELSE() "ALL" STRING "A list of components to enable in testing and building" - VALID_ENTRIES BATCHED BLAS LAPACK GRAPH SPARSE ALL + VALID_ENTRIES BATCHED BLAS LAPACK CUSOLVER ROCSOLVER GRAPH SPARSE ALL ) # ================================================================== @@ -245,6 +245,8 @@ ELSE() MESSAGE(" BATCHED: ${KokkosKernels_ENABLE_COMPONENT_BATCHED}") MESSAGE(" BLAS: ${KokkosKernels_ENABLE_COMPONENT_BLAS}") MESSAGE(" LAPACK: ${KokkosKernels_ENABLE_COMPONENT_LAPACK}") + MESSAGE(" CUSOLVER: ${KokkosKernels_ENABLE_COMPONENT_CUSOLVER}") + MESSAGE(" ROCSOLVER: ${KokkosKernels_ENABLE_COMPONENT_ROCSOLVER}") MESSAGE(" GRAPH: ${KokkosKernels_ENABLE_COMPONENT_GRAPH}") MESSAGE(" SPARSE: ${KokkosKernels_ENABLE_COMPONENT_SPARSE}") MESSAGE(" ODE: ${KokkosKernels_ENABLE_COMPONENT_ODE}") @@ -292,6 +294,12 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) INCLUDE(lapack/CMakeLists.txt) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_CUSOLVER) + INCLUDE(lapack/CMakeLists.txt) + ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_ROCSOLVER) + INCLUDE(lapack/CMakeLists.txt) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) INCLUDE(graph/CMakeLists.txt) ENDIF() @@ -374,8 +382,10 @@ ELSE() KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CHOLMOD) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MKL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUBLAS) + KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCBLAS) + KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC METIS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ARMPL) @@ -413,6 +423,12 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_CUSOLVER) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) + ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_ROCSOLVER) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) ENDIF() diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 9078281e59..04e48f8c92 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -21,6 +21,7 @@ // no experimental name space guard for trilinos #define __KOKKOSBATCHED_PROMOTION__ 1 +#include #include #include #include diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index 5ff581bb64..fd63180cf7 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -16,6 +16,7 @@ #ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ #define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ #include "KokkosBatched_Gemm_Decl.hpp" +#include namespace KokkosBatched { namespace Impl { @@ -181,4 +182,4 @@ class BatchedSerialGemm { }; } // namespace Impl } // namespace KokkosBatched -#endif \ No newline at end of file +#endif diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 3358ae2eb8..4347eb4b3b 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -177,8 +177,10 @@ get_kernels_tpls_list() { KOKKOSKERNELS_USER_TPL_PATH_CMD= KOKKOSKERNELS_USER_TPL_LIBNAME_CMD= CUBLAS_DEFAULT=OFF + CUSOLVER_DEFAULT=OFF CUSPARSE_DEFAULT=OFF ROCBLAS_DEFAULT=OFF + ROCSOLVER_DEFAULT=OFF ROCSPARSE_DEFAULT=OFF PARSE_TPLS_LIST=$(echo $KOKKOSKERNELS_TPLS | tr "," "\n") for TPLS_ in $PARSE_TPLS_LIST @@ -188,12 +190,18 @@ get_kernels_tpls_list() { if [ "$UC_TPLS" == "CUBLAS" ]; then CUBLAS_DEFAULT=ON fi + if [ "$UC_TPLS" == "CUSOLVER" ]; then + CUSOLVER_DEFAULT=ON + fi if [ "$UC_TPLS" == "CUSPARSE" ]; then CUSPARSE_DEFAULT=ON fi if [ "$UC_TPLS" == "ROCBLAS" ]; then ROCBLAS_DEFAULT=ON fi + if [ "$UC_TPLS" == "ROCSOLVER" ]; then + ROCSOLVER_DEFAULT=ON + fi if [ "$UC_TPLS" == "ROCSPARSE" ]; then ROCSPARSE_DEFAULT=ON fi @@ -221,12 +229,18 @@ get_kernels_tpls_list() { if [ "$CUBLAS_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUBLAS=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi + if [ "$CUSOLVER_DEFAULT" == "OFF" ]; then + KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF ${KOKKOSKERNELS_TPLS_CMD}" + fi if [ "$CUSPARSE_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi if [ "$ROCBLAS_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi + if [ "$ROCSOLVER_DEFAULT" == "OFF" ]; then + KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCSOLVER=OFF ${KOKKOSKERNELS_TPLS_CMD}" + fi if [ "$ROCSPARSE_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi @@ -345,7 +359,7 @@ display_help_text() { echo "--with-tpls=[TPLS]: Set tpls to be instantiated (Proper support requies that appropriate compiler and device must be enabled)." echo " This may require providing paths and the library name if using custom installs not on a default path" echo " that CMake searches" - echo " Options: blas, mkl, cublas, cusparse, magma, armpl, rocblas, rocsparse" + echo " Options: blas, mkl, cublas, cusolver, cusparse, magma, armpl, rocblas, rocsolver, rocsparse" echo "--user-blas-path=[PATH]: Set path to location of user-specified BLAS library." echo "--user-blas-lib=[LIB]: Library name of desired BLAS install." echo " Example: For the typical \"libblas.a\" provide \"blas\"" diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 621c78bfcc..bf063e7b63 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -117,6 +117,8 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSPARSE /* CUBLAS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUBLAS +/* CUSOLVER */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSOLVER /* MAGMA */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MAGMA /* SuperLU */ @@ -135,6 +137,8 @@ #cmakedefine ARMPL_BUILD @ARMPL_BUILD@ /* ROCBLAS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +/* ROCSOLVER */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER /* ROCSPARSE */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE diff --git a/cmake/Modules/FindTPLCUSOLVER.cmake b/cmake/Modules/FindTPLCUSOLVER.cmake new file mode 100644 index 0000000000..e10d46e58c --- /dev/null +++ b/cmake/Modules/FindTPLCUSOLVER.cmake @@ -0,0 +1,18 @@ +FIND_PACKAGE(CUDA) + +INCLUDE(FindPackageHandleStandardArgs) +IF (NOT CUDA_FOUND) + #Important note here: this find Module is named TPLCUSOLVER + #The eventual target is named CUSOLVER. To avoid naming conflicts + #the find module is called TPLCUSOLVER. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_FOUND) +ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_CUSOLVER_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_CUSOLVER_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSOLVER INTERFACE + LINK_LIBRARIES "${CUDA_CUSOLVER_LIBRARIES}") + ENDIF() +ENDIF() diff --git a/cmake/Modules/FindTPLROCSOLVER.cmake b/cmake/Modules/FindTPLROCSOLVER.cmake new file mode 100644 index 0000000000..c4389f7bae --- /dev/null +++ b/cmake/Modules/FindTPLROCSOLVER.cmake @@ -0,0 +1,12 @@ +FIND_PACKAGE(ROCSOLVER) +if(TARGET roc::rocsolver) +## MPL: 12/29/2022: Variable TPL_ROCSOLVER_IMPORTED_NAME follows the requested convention +## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) + SET(TPL_ROCSOLVER_IMPORTED_NAME roc::rocsolver) + SET(TPL_IMPORTED_NAME roc::rocsolver) +## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention +## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) + ADD_LIBRARY(KokkosKernels::ROCSOLVER ALIAS roc::rocsolver) +ELSE() + MESSAGE(FATAL_ERROR "Package ROCSOLVER requested but not found") +ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 6496487081..9584e028cd 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -456,27 +456,35 @@ ENDIF() KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_CUDA_TPLS OFF BOOL "Whether CUDA TPLs should be enabled by default. Default: OFF") SET(CUBLAS_DEFAULT ${KOKKOS_ENABLE_CUDA}) +SET(CUSOLVER_DEFAULT ${KOKKOS_ENABLE_CUDA}) SET(CUSPARSE_DEFAULT ${KOKKOS_ENABLE_CUDA}) IF(KOKKOSKERNELS_NO_DEFAULT_CUDA_TPLS) SET(CUBLAS_DEFAULT OFF) + SET(CUSOLVER_DEFAULT OFF) SET(CUSPARSE_DEFAULT OFF) ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(CUBLAS ${CUBLAS_DEFAULT} "Whether to enable CUBLAS" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_TPL_OPTION(CUSOLVER ${CUSOLVER_DEFAULT} "Whether to enable CUSOLVER" + DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_TPL_OPTION(CUSPARSE ${CUSPARSE_DEFAULT} "Whether to enable CUSPARSE" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_ROCM_TPLS OFF BOOL "Whether ROCM TPLs should be enabled by default. Default: OFF") # Unlike CUDA, ROCm does not automatically install these TPLs SET(ROCBLAS_DEFAULT OFF) +SET(ROCSOLVER_DEFAULT OFF) SET(ROCSPARSE_DEFAULT OFF) # Since the default is OFF we do not really need this piece of logic here. # IF(KOKKOSKERNELS_NO_DEFAULT_ROCM_TPLS) # SET(ROCBLAS_DEFAULT OFF) +# SET(ROCSOLVER_DEFAULT OFF) # SET(ROCSPARSE_DEFAULT OFF) # ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(ROCBLAS ${ROCBLAS_DEFAULT} "Whether to enable ROCBLAS" DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_TPL_OPTION(ROCSOLVER ${ROCSOLVER_DEFAULT} "Whether to enable ROCSOLVER" + DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_TPL_OPTION(ROCSPARSE ${ROCSPARSE_DEFAULT} "Whether to enable ROCSPARSE" DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") @@ -508,6 +516,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(LAPACK) KOKKOSKERNELS_IMPORT_TPL(MKL) KOKKOSKERNELS_IMPORT_TPL(CUBLAS) + KOKKOSKERNELS_IMPORT_TPL(CUSOLVER) KOKKOSKERNELS_IMPORT_TPL(CUSPARSE) KOKKOSKERNELS_IMPORT_TPL(CBLAS) KOKKOSKERNELS_IMPORT_TPL(LAPACKE) @@ -517,6 +526,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(ARMPL) KOKKOSKERNELS_IMPORT_TPL(MAGMA) KOKKOSKERNELS_IMPORT_TPL(ROCBLAS) + KOKKOSKERNELS_IMPORT_TPL(ROCSOLVER) KOKKOSKERNELS_IMPORT_TPL(ROCSPARSE) ELSE () IF (Trilinos_ENABLE_SuperLU5_API) diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index 238fdef187..92c422cfe8 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -109,4 +109,4 @@ int main(int argc, char **argv) { do_xpy(n, time_only); Kokkos::finalize(); return 0; -} \ No newline at end of file +} diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 0f38d0aa50..8ab784a325 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -8,11 +8,11 @@ LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/tpls) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/lapack) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/lapack) -####################### -# # +######################### +# # # Logic for LAPACK TPLs # -# # -####################### +# # +######################### #Include LAPACK, Lapack host wrapper IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) @@ -28,14 +28,14 @@ IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKER ENDIF() # Include cuda lapack TPL source file -IF (KOKKOSKERNELS_ENABLE_TPL_CULAPACK) +IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) LIST(APPEND SOURCES lapack/tpls/KokkosLapack_Cuda_tpl.cpp ) ENDIF() # Include rocm lapack TPL source file -IF (KOKKOSKERNELS_ENABLE_TPL_ROCLAPACK) +IF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER) LIST(APPEND SOURCES lapack/tpls/KokkosLapack_Rocm_tpl.cpp ) From b06bf0f370ab88276807f245c33bb7cb87c7d3d8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 10 Oct 2023 13:22:48 -0600 Subject: [PATCH 037/326] Backup --- lapack/tpls/KokkosLapack_Cuda_tpl.cpp | 18 ++ lapack/tpls/KokkosLapack_Cuda_tpl.hpp | 64 +++++++ lapack/tpls/KokkosLapack_tpl_spec.hpp | 165 +++++++++--------- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 1 + 4 files changed, 166 insertions(+), 82 deletions(-) create mode 100644 lapack/tpls/KokkosLapack_Cuda_tpl.cpp create mode 100644 lapack/tpls/KokkosLapack_Cuda_tpl.hpp diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.cpp b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp new file mode 100644 index 0000000000..2ac28871a4 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp @@ -0,0 +1,18 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#include +#include +#include diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp new file mode 100644 index 0000000000..b59d6d99c8 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -0,0 +1,64 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_CUDA_TPL_HPP_ +#define KOKKOSLAPACK_CUDA_TPL_HPP_ + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +#include + +namespace KokkosLapack { +namespace Impl { + +CudaLapackSingleton::CudaLapackSingleton() { + cusolverStatus_t stat = cusolverDnCreate(&handle); + if (stat != CUSOLVER_STATUS_SUCCESS) + Kokkos::abort("CUSOLVER initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); +} + +CudaLapackSingleton& CudaLapackSingleton::singleton() { + static CudaLapackSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#include + +namespace KokkosLapack { +namespace Impl { + +MagmaSingleton::MagmaSingleton() { + magma_int_t stat = magma_init(); + if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { magma_finalize(); }); +} + +MagmaSingleton& MagmaSingleton::singleton() { + static MagmaSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + +#endif // KOKKOSLAPACK_CUDA_TPL_HPP_ diff --git a/lapack/tpls/KokkosLapack_tpl_spec.hpp b/lapack/tpls/KokkosLapack_tpl_spec.hpp index a20c5d9a92..7bfffc780f 100644 --- a/lapack/tpls/KokkosLapack_tpl_spec.hpp +++ b/lapack/tpls/KokkosLapack_tpl_spec.hpp @@ -17,56 +17,57 @@ #ifndef KOKKOSLAPACK_TPL_SPEC_HPP_ #define KOKKOSLAPACK_TPL_SPEC_HPP_ -#ifdef KOKKOSKERNELS_ENABLE_TPL_CULAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER #include "cuda_runtime.h" -#include "culapack_v2.h" +//#include "cublas_v2.h" +#include "cusolverDn.h" namespace KokkosLapack { namespace Impl { struct CudaLapackSingleton { - culapackHandle_t handle; + cusolverDnHandle_t handle; CudaLapackSingleton(); static CudaLapackSingleton& singleton(); }; -inline void culapack_internal_error_throw(culapackStatus_t culapackState, +inline void cusolver_internal_error_throw(cusolverStatus_t cusolverState, const char* name, const char* file, const int line) { std::ostringstream out; - // out << name << " error( " << culapackGetStatusName(culapackState) - // << "): " << culapackGetStatusString(culapackState); + // out << name << " error( " << cusolverGetStatusName(cusolverState) + // << "): " << cusolverGetStatusString(cusolverState); out << name << " error( "; - switch (culapackState) { - case CULAPACK_STATUS_NOT_INITIALIZED: - out << "CULAPACK_STATUS_NOT_INITIALIZED): the library was not initialized."; + switch (cusolverState) { + case CUSOLVER_STATUS_NOT_INITIALIZED: + out << "CUSOLVER_STATUS_NOT_INITIALIZED): the library was not initialized."; break; - case CULAPACK_STATUS_ALLOC_FAILED: - out << "CULAPACK_STATUS_ALLOC_FAILED): the resource allocation failed."; + case CUSOLVER_STATUS_ALLOC_FAILED: + out << "CUSOLVER_STATUS_ALLOC_FAILED): the resource allocation failed."; break; - case CULAPACK_STATUS_INVALID_VALUE: - out << "CULAPACK_STATUS_INVALID_VALUE): an invalid numerical value was " + case CUSOLVER_STATUS_INVALID_VALUE: + out << "CUSOLVER_STATUS_INVALID_VALUE): an invalid numerical value was " "used as an argument."; break; - case CULAPACK_STATUS_ARCH_MISMATCH: - out << "CULAPACK_STATUS_ARCH_MISMATCH): an absent device architectural " + case CUSOLVER_STATUS_ARCH_MISMATCH: + out << "CUSOLVER_STATUS_ARCH_MISMATCH): an absent device architectural " "feature is required."; break; - case CULAPACK_STATUS_MAPPING_ERROR: - out << "CULAPACK_STATUS_MAPPING_ERROR): an access to GPU memory space " + case CUSOLVER_STATUS_MAPPING_ERROR: + out << "CUSOLVER_STATUS_MAPPING_ERROR): an access to GPU memory space " "failed."; break; - case CULAPACK_STATUS_EXECUTION_FAILED: - out << "CULAPACK_STATUS_EXECUTION_FAILED): the GPU program failed to " + case CUSOLVER_STATUS_EXECUTION_FAILED: + out << "CUSOLVER_STATUS_EXECUTION_FAILED): the GPU program failed to " "execute."; break; - case CULAPACK_STATUS_INTERNAL_ERROR: - out << "CULAPACK_STATUS_INTERNAL_ERROR): an internal operation failed."; + case CUSOLVER_STATUS_INTERNAL_ERROR: + out << "CUSOLVER_STATUS_INTERNAL_ERROR): an internal operation failed."; break; - case CULAPACK_STATUS_NOT_SUPPORTED: - out << "CULAPACK_STATUS_NOT_SUPPORTED): the feature required is not " + case CUSOLVER_STATUS_NOT_SUPPORTED: + out << "CUSOLVER_STATUS_NOT_SUPPORTED): the feature required is not " "supported."; break; default: out << "unrecognized error code): this is bad!"; break; @@ -77,101 +78,101 @@ inline void culapack_internal_error_throw(culapackStatus_t culapackState, throw std::runtime_error(out.str()); } -inline void culapack_internal_safe_call(culapackStatus_t culapackState, +inline void cusolver_internal_safe_call(cusolverStatus_t cusolverState, const char* name, const char* file = nullptr, const int line = 0) { - if (CULAPACK_STATUS_SUCCESS != culapackState) { - culapack_internal_error_throw(culapackState, name, file, line); + if (CUSOLVER_STATUS_SUCCESS != cusolverState) { + cusolver_internal_error_throw(cusolverState, name, file, line); } } -// The macro below defines the interface for the safe culapack calls. +// The macro below defines the interface for the safe cusolver calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_CULAPACK_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::culapack_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, __LINE__) -/// \brief This function converts KK transpose mode to cuLAPACK transpose mode -inline culapackOperation_t trans_mode_kk_to_culapack(const char kkMode[]) { - culapackOperation_t trans; +/// \brief This function converts KK transpose mode to cusolver transpose mode +inline cublasOperation_t trans_mode_kk_to_cusolver(const char kkMode[]) { + cublasOperation_t trans; if ((kkMode[0] == 'N') || (kkMode[0] == 'n')) - trans = CULAPACK_OP_N; + trans = CUBLAS_OP_N; else if ((kkMode[0] == 'T') || (kkMode[0] == 't')) - trans = CULAPACK_OP_T; + trans = CUBLAS_OP_T; else - trans = CULAPACK_OP_C; + trans = CUBLAS_OP_C; return trans; } } // namespace Impl } // namespace KokkosLapack -#endif // KOKKOSKERNELS_ENABLE_TPL_CULAPACK +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCLAPACK -#include +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#include namespace KokkosLapack { namespace Impl { -struct RocLapackSingleton { - roclapack_handle handle; +struct RocsolverSingleton { + rocsolver_handle handle; - RocLapackSingleton(); + RocsolverSingleton(); - static RocLapackSingleton& singleton(); + static RocsolverSingleton& singleton(); }; -inline void roclapack_internal_error_throw(roclapack_status roclapackState, +inline void rocsolver_internal_error_throw(rocsolver_status rocsolverState, const char* name, const char* file, const int line) { std::ostringstream out; out << name << " error( "; - switch (roclapackState) { - case roclapack_status_invalid_handle: - out << "roclapack_status_invalid_handle): handle not initialized, invalid " + switch (rocsolverState) { + case rocsolver_status_invalid_handle: + out << "rocsolver_status_invalid_handle): handle not initialized, invalid " "or null."; break; - case roclapack_status_not_implemented: - out << "roclapack_status_not_implemented): function is not implemented."; + case rocsolver_status_not_implemented: + out << "rocsolver_status_not_implemented): function is not implemented."; break; - case roclapack_status_invalid_pointer: - out << "roclapack_status_invalid_pointer): invalid pointer argument."; + case rocsolver_status_invalid_pointer: + out << "rocsolver_status_invalid_pointer): invalid pointer argument."; break; - case roclapack_status_invalid_size: - out << "roclapack_status_invalid_size): invalid size argument."; + case rocsolver_status_invalid_size: + out << "rocsolver_status_invalid_size): invalid size argument."; break; - case roclapack_status_memory_error: - out << "roclapack_status_memory_error): failed internal memory allocation, " + case rocsolver_status_memory_error: + out << "rocsolver_status_memory_error): failed internal memory allocation, " "copy or dealloc."; break; - case roclapack_status_internal_error: - out << "roclapack_status_internal_error): other internal library failure."; + case rocsolver_status_internal_error: + out << "rocsolver_status_internal_error): other internal library failure."; break; - case roclapack_status_perf_degraded: - out << "roclapack_status_perf_degraded): performance degraded due to low " + case rocsolver_status_perf_degraded: + out << "rocsolver_status_perf_degraded): performance degraded due to low " "device memory."; break; - case roclapack_status_size_query_mismatch: + case rocsolver_status_size_query_mismatch: out << "unmatched start/stop size query): ."; break; - case roclapack_status_size_increased: - out << "roclapack_status_size_increased): queried device memory size " + case rocsolver_status_size_increased: + out << "rocsolver_status_size_increased): queried device memory size " "increased."; break; - case roclapack_status_size_unchanged: - out << "roclapack_status_size_unchanged): queried device memory size " + case rocsolver_status_size_unchanged: + out << "rocsolver_status_size_unchanged): queried device memory size " "unchanged."; break; - case roclapack_status_invalid_value: - out << "roclapack_status_invalid_value): passed argument not valid."; + case rocsolver_status_invalid_value: + out << "rocsolver_status_invalid_value): passed argument not valid."; break; - case roclapack_status_continue: - out << "roclapack_status_continue): nothing preventing function to " + case rocsolver_status_continue: + out << "rocsolver_status_continue): nothing preventing function to " "proceed."; break; - case roclapack_status_check_numerics_fail: - out << "roclapack_status_check_numerics_fail): will be set if the " + case rocsolver_status_check_numerics_fail: + out << "rocsolver_status_check_numerics_fail): will be set if the " "vector/matrix has a NaN or an Infinity."; break; default: out << "unrecognized error code): this is bad!"; break; @@ -182,37 +183,37 @@ inline void roclapack_internal_error_throw(roclapack_status roclapackState, throw std::runtime_error(out.str()); } -inline void roclapack_internal_safe_call(roclapack_status roclapackState, +inline void rocsolver_internal_safe_call(rocsolver_status rocsolverState, const char* name, const char* file = nullptr, const int line = 0) { - if (roclapack_status_success != roclapackState) { - roclapack_internal_error_throw(roclapackState, name, file, line); + if (rocsolver_status_success != rocsolverState) { + rocsolver_internal_error_throw(rocsolverState, name, file, line); } } -// The macro below defines the interface for the safe roclapack calls. +// The macro below defines the interface for the safe rocsolver calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_ROCLAPACK_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::roclapack_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOS_ROCSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::rocsolver_internal_safe_call(call, #call, __FILE__, __LINE__) -/// \brief This function converts KK transpose mode to rocLAPACK transpose mode -inline roclapack_operation trans_mode_kk_to_roclapack(const char kkMode[]) { - roclapack_operation trans; +/// \brief This function converts KK transpose mode to rocsolver transpose mode +inline rocsolver_operation trans_mode_kk_to_rocsolver(const char kkMode[]) { + rocsolver_operation trans; if ((kkMode[0] == 'N') || (kkMode[0] == 'n')) - trans = roclapack_operation_none; + trans = rocsolver_operation_none; else if ((kkMode[0] == 'T') || (kkMode[0] == 't')) - trans = roclapack_operation_transpose; + trans = rocsolver_operation_transpose; else - trans = roclapack_operation_conjugate_transpose; + trans = rocsolver_operation_conjugate_transpose; return trans; } } // namespace Impl } // namespace KokkosLapack -#endif // KOKKOSKERNELS_ENABLE_TPL_ROCLAPACK +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER // If LAPACK TPL is enabled, it is preferred over magma's LAPACK #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index f28e04e26b..157e21dca7 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -20,6 +20,7 @@ #include #include "KokkosKernels_Controls.hpp" +#include #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE From ca4a9431015b2df307419bddcb460a859e04a5e2 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 14 Oct 2023 14:04:02 -0600 Subject: [PATCH 038/326] Backup --- batched/KokkosBatched_Util.hpp | 1 - batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 04e48f8c92..9078281e59 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -21,7 +21,6 @@ // no experimental name space guard for trilinos #define __KOKKOSBATCHED_PROMOTION__ 1 -#include #include #include #include diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index fd63180cf7..35821318b6 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -16,7 +16,6 @@ #ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ #define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ #include "KokkosBatched_Gemm_Decl.hpp" -#include namespace KokkosBatched { namespace Impl { From db8d1c0351db9065304898632ae67cdab69f69ef Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 14 Oct 2023 14:17:13 -0600 Subject: [PATCH 039/326] Backup --- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 157e21dca7..f28e04e26b 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -20,7 +20,6 @@ #include #include "KokkosKernels_Controls.hpp" -#include #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE From f648609285ebb44ea2fa864f6878895750ac27fb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 2 Oct 2023 08:12:35 -0600 Subject: [PATCH 040/326] Address CI build errors --- blas/impl/KokkosBlas2_gemv_impl.hpp | 79 +++++++++---------- blas/impl/KokkosBlas2_gemv_spec.hpp | 6 +- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 12 ++- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 16 ++-- 4 files changed, 58 insertions(+), 55 deletions(-) diff --git a/blas/impl/KokkosBlas2_gemv_impl.hpp b/blas/impl/KokkosBlas2_gemv_impl.hpp index 730f88602a..dc0f531583 100644 --- a/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -199,10 +199,9 @@ struct SingleLevelTransposeGEMV { }; // Single-level parallel version of GEMV. -template -void singleLevelGemv(const typename AViewType::execution_space& space, - const char trans[], +template +void singleLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -222,9 +221,8 @@ void singleLevelGemv(const typename AViewType::execution_space& space, static_assert(std::is_integral::value, "IndexType must be an integer"); - using y_value_type = typename YViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::RangePolicy; + using y_value_type = typename YViewType::non_const_value_type; + using policy_type = Kokkos::RangePolicy; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; @@ -442,8 +440,8 @@ struct TwoLevelGEMV_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- // Functor for a two-level parallel_reduce version of GEMV (non-transpose), // designed for performance on GPU. Kernel depends on the layout of A. -template +template struct TwoLevelGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -453,9 +451,8 @@ struct TwoLevelGEMV { std::is_same::value, float, y_value_type>::type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -564,7 +561,8 @@ struct TwoLevelGEMV { // transpose GEMV. The functor uses parallel-for over the columns of the input // matrix A and each team uses parallel-reduce over the row of its column. // The output vector y is the reduction result. -template struct TwoLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; @@ -575,9 +573,8 @@ struct TwoLevelTransposeGEMV { std::is_same::value, float, y_value_type>::type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -637,10 +634,9 @@ struct TwoLevelTransposeGEMV { }; // Two-level parallel version of GEMV. -template -void twoLevelGemv(const typename AViewType::execution_space& space, - const char trans[], +template +void twoLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -661,9 +657,8 @@ void twoLevelGemv(const typename AViewType::execution_space& space, "IndexType must be an integer"); using y_value_type = typename YViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using team_policy_type = Kokkos::TeamPolicy; - using range_policy_type = Kokkos::RangePolicy; + using team_policy_type = Kokkos::TeamPolicy; + using range_policy_type = Kokkos::RangePolicy; using Kokkos::ArithTraits; using KAT = ArithTraits; @@ -704,19 +699,19 @@ void twoLevelGemv(const typename AViewType::execution_space& space, using layout_tag = typename std::conditional::type; - using tagged_policy = Kokkos::TeamPolicy; - using functor_type = - TwoLevelGEMV; + using tagged_policy = Kokkos::TeamPolicy; + using functor_type = TwoLevelGEMV; functor_type functor(alpha, A, x, beta, y); tagged_policy team; - if (isLayoutLeft) { + if constexpr (isLayoutLeft) { using AccumScalar = typename std::conditional< std::is_same::value || std::is_same::value, float, y_value_type>::type; size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; - tagged_policy temp(1, 1); + tagged_policy temp(space, 1, 1); temp.set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); int teamSize = temp.team_size_recommended(functor, Kokkos::ParallelForTag()); @@ -727,7 +722,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space, // FIXME SYCL: team_size_recommended() returns too big of a team size. // Kernel hangs with 1024 threads on XEHP. #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same::value) { + if (std::is_same::value) { if (teamSize > 256) teamSize = 256; } #endif @@ -749,16 +744,18 @@ void twoLevelGemv(const typename AViewType::execution_space& space, } else if (tr == 'T') { // transpose, and not conj transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = TwoLevelTransposeGEMV; + using functor_type = + TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); } else if (tr == 'C' || tr == 'H') { // conjugate transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = TwoLevelTransposeGEMV; + using functor_type = + TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); @@ -769,11 +766,11 @@ void twoLevelGemv(const typename AViewType::execution_space& space, // generalGemv: use 1 level (Range) or 2 level (Team) implementation, // depending on whether execution space is CPU or GPU. enable_if makes sure // unused kernels are not instantiated. -template ()>::type* = nullptr> -void generalGemvImpl(const typename AViewType::execution_space& space, - const char trans[], + ExecutionSpace>()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -781,11 +778,11 @@ void generalGemvImpl(const typename AViewType::execution_space& space, singleLevelGemv(space, trans, alpha, A, x, beta, y); } -template ()>::type* = nullptr> -void generalGemvImpl(const typename AViewType::execution_space& space, - const char trans[], + ExecutionSpace>()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index 08842a61c0..97e6e2717e 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -104,10 +104,10 @@ struct GEMV { // Prefer int as the index type, but use a larger type if needed. if (numRows < static_cast(INT_MAX) && numCols < static_cast(INT_MAX)) { - generalGemvImpl(space, trans, alpha, - A, x, beta, y); + generalGemvImpl( + space, trans, alpha, A, x, beta, y); } else { - generalGemvImpl( + generalGemvImpl( space, trans, alpha, A, x, beta, y); } Kokkos::Profiling::popRegion(); diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 0b9afa7796..019a63fcd7 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -159,11 +159,11 @@ void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, std::is_same::value || std::is_same::value; - if (!is_cuda_space) { + if constexpr (!is_cuda_space) { throw std::runtime_error( "KokkosKernels sptrsvcuSPARSE_symbolic: MEMORY IS NOT ALLOCATED IN GPU " "DEVICE for CUSPARSE\n"); - } else if (std::is_same::value) { + } else if constexpr (std::is_same::value) { bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); @@ -277,6 +277,7 @@ void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, } #endif #else + (void)space; (void)sptrsv_handle; (void)nrows; (void)row_map; @@ -369,8 +370,10 @@ void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetStream(h->handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + } int nnz = entries.extent_int(0); @@ -440,6 +443,7 @@ void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, } #endif #else + (void)space; (void)sptrsv_handle; (void)nrows; (void)row_map; diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 1d8613922b..ecc5d13308 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2913,7 +2913,8 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::memory_space; + using memory_space = typename ExecutionSpace::memory_space; + using device_t = Kokkos::Device; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -3075,7 +3076,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, // NOTE: we currently supports only default_layout = LayoutLeft using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = - Kokkos::View; if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to scatter the @@ -3148,7 +3149,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, char unit_diag = (unit_diagonal ? 'U' : 'N'); // NOTE: we currently supports only default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, @@ -3311,6 +3312,7 @@ void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, cudaProfilerStop(); #endif using memory_space = typename ExecutionSpace::memory_space; + using device_t = Kokkos::Device; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3527,7 +3529,7 @@ tstf); } // end elseif // create a view for the s-th supernocal block column // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, nscol); @@ -3562,7 +3564,7 @@ tstf); } // end elseif } else { // NOTE: we currently supports only default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); @@ -3658,7 +3660,7 @@ tstf); } // end elseif // create a view for the s-th supernocal block column // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, nscol); @@ -3695,7 +3697,7 @@ tstf); } // end elseif KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); From 6d582b62d9b136cb91d0065ed44642617c79c3cc Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 24 Oct 2023 22:13:00 -0600 Subject: [PATCH 041/326] Some cleanup on current pull request, making it more related to 'just' the creation of the lapack subdirectory and the moving of some files to there --- .github/workflows/osx.yml | 1 - CMakeLists.txt | 18 +----------------- cm_generate_makefile.bash | 16 +--------------- cmake/KokkosKernels_config.h.in | 6 ------ cmake/Modules/FindTPLCUSOLVER.cmake | 18 ------------------ cmake/Modules/FindTPLROCSOLVER.cmake | 12 ------------ cmake/kokkoskernels_tpls.cmake | 11 ----------- 7 files changed, 2 insertions(+), 80 deletions(-) delete mode 100644 cmake/Modules/FindTPLCUSOLVER.cmake delete mode 100644 cmake/Modules/FindTPLROCSOLVER.cmake diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 10688ddb70..8d9f7123f8 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -103,7 +103,6 @@ jobs: -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ .. - name: build_kokkos_kernels diff --git a/CMakeLists.txt b/CMakeLists.txt index 85ce79d9ab..812640374b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,7 +193,7 @@ ELSE() "ALL" STRING "A list of components to enable in testing and building" - VALID_ENTRIES BATCHED BLAS LAPACK CUSOLVER ROCSOLVER GRAPH SPARSE ALL + VALID_ENTRIES BATCHED BLAS LAPACK GRAPH SPARSE ALL ) # ================================================================== @@ -245,8 +245,6 @@ ELSE() MESSAGE(" BATCHED: ${KokkosKernels_ENABLE_COMPONENT_BATCHED}") MESSAGE(" BLAS: ${KokkosKernels_ENABLE_COMPONENT_BLAS}") MESSAGE(" LAPACK: ${KokkosKernels_ENABLE_COMPONENT_LAPACK}") - MESSAGE(" CUSOLVER: ${KokkosKernels_ENABLE_COMPONENT_CUSOLVER}") - MESSAGE(" ROCSOLVER: ${KokkosKernels_ENABLE_COMPONENT_ROCSOLVER}") MESSAGE(" GRAPH: ${KokkosKernels_ENABLE_COMPONENT_GRAPH}") MESSAGE(" SPARSE: ${KokkosKernels_ENABLE_COMPONENT_SPARSE}") MESSAGE(" ODE: ${KokkosKernels_ENABLE_COMPONENT_ODE}") @@ -294,12 +292,6 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) INCLUDE(lapack/CMakeLists.txt) ENDIF() - IF (KokkosKernels_ENABLE_COMPONENT_CUSOLVER) - INCLUDE(lapack/CMakeLists.txt) - ENDIF() - IF (KokkosKernels_ENABLE_COMPONENT_ROCSOLVER) - INCLUDE(lapack/CMakeLists.txt) - ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) INCLUDE(graph/CMakeLists.txt) ENDIF() @@ -382,10 +374,8 @@ ELSE() KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CHOLMOD) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MKL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUBLAS) - KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCBLAS) - KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC METIS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ARMPL) @@ -423,12 +413,6 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) ENDIF() - IF (KokkosKernels_ENABLE_COMPONENT_CUSOLVER) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) - ENDIF() - IF (KokkosKernels_ENABLE_COMPONENT_ROCSOLVER) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) - ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) ENDIF() diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 4347eb4b3b..3358ae2eb8 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -177,10 +177,8 @@ get_kernels_tpls_list() { KOKKOSKERNELS_USER_TPL_PATH_CMD= KOKKOSKERNELS_USER_TPL_LIBNAME_CMD= CUBLAS_DEFAULT=OFF - CUSOLVER_DEFAULT=OFF CUSPARSE_DEFAULT=OFF ROCBLAS_DEFAULT=OFF - ROCSOLVER_DEFAULT=OFF ROCSPARSE_DEFAULT=OFF PARSE_TPLS_LIST=$(echo $KOKKOSKERNELS_TPLS | tr "," "\n") for TPLS_ in $PARSE_TPLS_LIST @@ -190,18 +188,12 @@ get_kernels_tpls_list() { if [ "$UC_TPLS" == "CUBLAS" ]; then CUBLAS_DEFAULT=ON fi - if [ "$UC_TPLS" == "CUSOLVER" ]; then - CUSOLVER_DEFAULT=ON - fi if [ "$UC_TPLS" == "CUSPARSE" ]; then CUSPARSE_DEFAULT=ON fi if [ "$UC_TPLS" == "ROCBLAS" ]; then ROCBLAS_DEFAULT=ON fi - if [ "$UC_TPLS" == "ROCSOLVER" ]; then - ROCSOLVER_DEFAULT=ON - fi if [ "$UC_TPLS" == "ROCSPARSE" ]; then ROCSPARSE_DEFAULT=ON fi @@ -229,18 +221,12 @@ get_kernels_tpls_list() { if [ "$CUBLAS_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUBLAS=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi - if [ "$CUSOLVER_DEFAULT" == "OFF" ]; then - KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF ${KOKKOSKERNELS_TPLS_CMD}" - fi if [ "$CUSPARSE_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi if [ "$ROCBLAS_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi - if [ "$ROCSOLVER_DEFAULT" == "OFF" ]; then - KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCSOLVER=OFF ${KOKKOSKERNELS_TPLS_CMD}" - fi if [ "$ROCSPARSE_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi @@ -359,7 +345,7 @@ display_help_text() { echo "--with-tpls=[TPLS]: Set tpls to be instantiated (Proper support requies that appropriate compiler and device must be enabled)." echo " This may require providing paths and the library name if using custom installs not on a default path" echo " that CMake searches" - echo " Options: blas, mkl, cublas, cusolver, cusparse, magma, armpl, rocblas, rocsolver, rocsparse" + echo " Options: blas, mkl, cublas, cusparse, magma, armpl, rocblas, rocsparse" echo "--user-blas-path=[PATH]: Set path to location of user-specified BLAS library." echo "--user-blas-lib=[LIB]: Library name of desired BLAS install." echo " Example: For the typical \"libblas.a\" provide \"blas\"" diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index bf063e7b63..b8b66fffbb 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -109,16 +109,12 @@ /* BLAS library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_BLAS -/* LAPACK library */ -#cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL /* CUSPARSE */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSPARSE /* CUBLAS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUBLAS -/* CUSOLVER */ -#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSOLVER /* MAGMA */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MAGMA /* SuperLU */ @@ -137,8 +133,6 @@ #cmakedefine ARMPL_BUILD @ARMPL_BUILD@ /* ROCBLAS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -/* ROCSOLVER */ -#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER /* ROCSPARSE */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE diff --git a/cmake/Modules/FindTPLCUSOLVER.cmake b/cmake/Modules/FindTPLCUSOLVER.cmake deleted file mode 100644 index e10d46e58c..0000000000 --- a/cmake/Modules/FindTPLCUSOLVER.cmake +++ /dev/null @@ -1,18 +0,0 @@ -FIND_PACKAGE(CUDA) - -INCLUDE(FindPackageHandleStandardArgs) -IF (NOT CUDA_FOUND) - #Important note here: this find Module is named TPLCUSOLVER - #The eventual target is named CUSOLVER. To avoid naming conflicts - #the find module is called TPLCUSOLVER. This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_FOUND) -ELSE() - #The libraries might be empty - OR they might explicitly be not found - IF("${CUDA_CUSOLVER_LIBRARIES}" MATCHES "NOTFOUND") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_CUSOLVER_LIBRARIES) - ELSE() - KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSOLVER INTERFACE - LINK_LIBRARIES "${CUDA_CUSOLVER_LIBRARIES}") - ENDIF() -ENDIF() diff --git a/cmake/Modules/FindTPLROCSOLVER.cmake b/cmake/Modules/FindTPLROCSOLVER.cmake deleted file mode 100644 index c4389f7bae..0000000000 --- a/cmake/Modules/FindTPLROCSOLVER.cmake +++ /dev/null @@ -1,12 +0,0 @@ -FIND_PACKAGE(ROCSOLVER) -if(TARGET roc::rocsolver) -## MPL: 12/29/2022: Variable TPL_ROCSOLVER_IMPORTED_NAME follows the requested convention -## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) - SET(TPL_ROCSOLVER_IMPORTED_NAME roc::rocsolver) - SET(TPL_IMPORTED_NAME roc::rocsolver) -## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention -## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) - ADD_LIBRARY(KokkosKernels::ROCSOLVER ALIAS roc::rocsolver) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCSOLVER requested but not found") -ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 9584e028cd..be1488e051 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -456,39 +456,30 @@ ENDIF() KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_CUDA_TPLS OFF BOOL "Whether CUDA TPLs should be enabled by default. Default: OFF") SET(CUBLAS_DEFAULT ${KOKKOS_ENABLE_CUDA}) -SET(CUSOLVER_DEFAULT ${KOKKOS_ENABLE_CUDA}) SET(CUSPARSE_DEFAULT ${KOKKOS_ENABLE_CUDA}) IF(KOKKOSKERNELS_NO_DEFAULT_CUDA_TPLS) SET(CUBLAS_DEFAULT OFF) - SET(CUSOLVER_DEFAULT OFF) SET(CUSPARSE_DEFAULT OFF) ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(CUBLAS ${CUBLAS_DEFAULT} "Whether to enable CUBLAS" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") -KOKKOSKERNELS_ADD_TPL_OPTION(CUSOLVER ${CUSOLVER_DEFAULT} "Whether to enable CUSOLVER" - DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_TPL_OPTION(CUSPARSE ${CUSPARSE_DEFAULT} "Whether to enable CUSPARSE" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_ROCM_TPLS OFF BOOL "Whether ROCM TPLs should be enabled by default. Default: OFF") # Unlike CUDA, ROCm does not automatically install these TPLs SET(ROCBLAS_DEFAULT OFF) -SET(ROCSOLVER_DEFAULT OFF) SET(ROCSPARSE_DEFAULT OFF) # Since the default is OFF we do not really need this piece of logic here. # IF(KOKKOSKERNELS_NO_DEFAULT_ROCM_TPLS) # SET(ROCBLAS_DEFAULT OFF) -# SET(ROCSOLVER_DEFAULT OFF) # SET(ROCSPARSE_DEFAULT OFF) # ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(ROCBLAS ${ROCBLAS_DEFAULT} "Whether to enable ROCBLAS" DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") -KOKKOSKERNELS_ADD_TPL_OPTION(ROCSOLVER ${ROCSOLVER_DEFAULT} "Whether to enable ROCSOLVER" - DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_TPL_OPTION(ROCSPARSE ${ROCSPARSE_DEFAULT} "Whether to enable ROCSPARSE" DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") -#AquiEEP IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) IF (F77_BLAS_MANGLE STREQUAL "(name,NAME) name ## _") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DADD_ -fopenmp -lgfortran") @@ -516,7 +507,6 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(LAPACK) KOKKOSKERNELS_IMPORT_TPL(MKL) KOKKOSKERNELS_IMPORT_TPL(CUBLAS) - KOKKOSKERNELS_IMPORT_TPL(CUSOLVER) KOKKOSKERNELS_IMPORT_TPL(CUSPARSE) KOKKOSKERNELS_IMPORT_TPL(CBLAS) KOKKOSKERNELS_IMPORT_TPL(LAPACKE) @@ -526,7 +516,6 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(ARMPL) KOKKOSKERNELS_IMPORT_TPL(MAGMA) KOKKOSKERNELS_IMPORT_TPL(ROCBLAS) - KOKKOSKERNELS_IMPORT_TPL(ROCSOLVER) KOKKOSKERNELS_IMPORT_TPL(ROCSPARSE) ELSE () IF (Trilinos_ENABLE_SuperLU5_API) From e8557be2780b221dee7757b2d14753aa587d1f4f Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 24 Oct 2023 22:16:05 -0600 Subject: [PATCH 042/326] More cleanup --- batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 2 +- example/half/xpy.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index 35821318b6..5ff581bb64 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -181,4 +181,4 @@ class BatchedSerialGemm { }; } // namespace Impl } // namespace KokkosBatched -#endif +#endif \ No newline at end of file diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index 92c422cfe8..238fdef187 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -109,4 +109,4 @@ int main(int argc, char **argv) { do_xpy(n, time_only); Kokkos::finalize(); return 0; -} +} \ No newline at end of file From 7c9ed9e6d3034c0d3e1143b97686cd17f5255e67 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 24 Oct 2023 22:49:07 -0600 Subject: [PATCH 043/326] Re-enabling gesv unit tests under the lapack subdirectory --- cmake/KokkosKernels_config.h.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index b8b66fffbb..4c54a350b3 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -109,6 +109,8 @@ /* BLAS library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_BLAS +/* LAPACKE */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL /* CUSPARSE */ From 6ac5ba3597b7a76c0343360848016f06efaf6fb3 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 24 Oct 2023 23:32:18 -0600 Subject: [PATCH 044/326] Adding BLAS routines back, for backwards compatibility --- blas/src/KokkosBlas_gesv.hpp | 55 +++++++++++++++++++++++++++++++++++ blas/src/KokkosBlas_trtri.hpp | 52 +++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 blas/src/KokkosBlas_gesv.hpp create mode 100644 blas/src/KokkosBlas_trtri.hpp diff --git a/blas/src/KokkosBlas_gesv.hpp b/blas/src/KokkosBlas_gesv.hpp new file mode 100644 index 0000000000..5e224c07e4 --- /dev/null +++ b/blas/src/KokkosBlas_gesv.hpp @@ -0,0 +1,55 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosBlas_gesv.hpp +/// \brief Local dense linear solve +/// +/// This file provides KokkosBlas::gesv. This function performs a +/// local (no MPI) dense linear solve on a system of linear equations +/// A * X = B where A is a general N-by-N matrix and X and B are N-by-NRHS +/// matrices. + +#ifndef KOKKOSBLAS_GESV_HPP_ +#define KOKKOSBLAS_GESV_HPP_ + +#include "KokkosLapack_gesv.hpp" + +namespace KokkosBlas { + +/// \brief Solve the dense linear equation system A*X = B. +/// +/// \tparam AMatrix Input matrix/Output LU, as a 2-D Kokkos::View. +/// \tparam BXMV Input (right-hand side)/Output (solution) (multi)vector, as a +/// 1-D or 2-D Kokkos::View. \tparam IPIVV Output pivot indices, as a 1-D +/// Kokkos::View +/// +/// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the +/// factors L and U from +/// the factorization A = P*L*U; the unit diagonal elements of L are not +/// stored. +/// \param B [in,out] On entry, the right hand side (multi)vector B. On exit, +/// the solution (multi)vector X. \param IPIV [out] On exit, the pivot indices +/// (for partial pivoting). If the View extents are zero and +/// its data pointer is NULL, pivoting is not used. +/// +template +[[deprecated]] void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { + KokkosLapack::gesv(A,B,IPIV); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS_GESV_HPP_ diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp new file mode 100644 index 0000000000..34ca96b2d4 --- /dev/null +++ b/blas/src/KokkosBlas_trtri.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBLAS_TRTRI_HPP_ +#define KOKKOSBLAS_TRTRI_HPP_ + +/// \file KokkosBlas_trtri.hpp + +#include "KokkosLapack_trtri.hpp" + +namespace KokkosBlas { + +/// \brief Find the inverse of the triangular matrix, A +/// +/// A = inv(A) +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// +/// \param uplo [in] "U" or "u" indicates matrix A is an upper triangular +/// matrix +/// "L" or "l" indicates matrix A is a lower triangular matrix +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit +// "N" or "n" indicates the diagonal of A is assumed to be +// non-unit +/// \param A [in,out] Input matrix, as a 2-D Kokkos::View +/// On entry, A +/// On successful exit, inv(A) +/// \return 0 upon success, +// i if the i-th diagonal elemet of A is zero, A is singular, +// and the inversion could not be completed. +// source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri +template +[[deprecated]] int trtri(const char uplo[], const char diag[], const AViewType& A) { + return KokkosLapack::trtri(uplo, diag, A); +} + +} // namespace KokkosBlas + +#endif // KOKKOS_BLASLAPACK_TRTRI_HPP_ From a62d66640f00803f2765d02abc8efbdcaa03c76f Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 24 Oct 2023 23:51:21 -0600 Subject: [PATCH 045/326] Formatting --- blas/src/KokkosBlas_gesv.hpp | 2 +- blas/src/KokkosBlas_trtri.hpp | 3 +- blas/tpls/KokkosBlas_Host_tpl.cpp | 1 - lapack/impl/KokkosLapack_gesv_spec.hpp | 68 +++---- lapack/impl/KokkosLapack_trtri_spec.hpp | 30 +-- lapack/src/KokkosLapack_gesv.hpp | 17 +- lapack/src/KokkosLapack_trtri.hpp | 4 +- lapack/tpls/KokkosLapack_Host_tpl.cpp | 29 +-- .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 26 +-- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 150 +++++++-------- lapack/tpls/KokkosLapack_tpl_spec.hpp | 42 +++-- .../KokkosLapack_trtri_tpl_spec_avail.hpp | 56 +++--- .../tpls/KokkosLapack_trtri_tpl_spec_decl.hpp | 175 +++++++++--------- lapack/unit_test/Test_Lapack_gesv.hpp | 27 ++- lapack/unit_test/Test_Lapack_trtri.hpp | 4 +- 15 files changed, 326 insertions(+), 308 deletions(-) diff --git a/blas/src/KokkosBlas_gesv.hpp b/blas/src/KokkosBlas_gesv.hpp index 5e224c07e4..1326c6fb8e 100644 --- a/blas/src/KokkosBlas_gesv.hpp +++ b/blas/src/KokkosBlas_gesv.hpp @@ -47,7 +47,7 @@ namespace KokkosBlas { /// template [[deprecated]] void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { - KokkosLapack::gesv(A,B,IPIV); + KokkosLapack::gesv(A, B, IPIV); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp index 34ca96b2d4..d9771e3a16 100644 --- a/blas/src/KokkosBlas_trtri.hpp +++ b/blas/src/KokkosBlas_trtri.hpp @@ -43,7 +43,8 @@ namespace KokkosBlas { // and the inversion could not be completed. // source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri template -[[deprecated]] int trtri(const char uplo[], const char diag[], const AViewType& A) { +[[deprecated]] int trtri(const char uplo[], const char diag[], + const AViewType& A) { return KokkosLapack::trtri(uplo, diag, A); } diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 88c3ef7bbd..71e22a690c 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -411,7 +411,6 @@ void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, const std::complex*, const std::complex*, int*, /* */ std::complex*, int*); - } void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, diff --git a/lapack/impl/KokkosLapack_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp index 8ea1df03bf..b9f8549311 100644 --- a/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -43,16 +43,16 @@ struct gesv_eti_spec_avail { // more .cpp files. // #define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct gesv_eti_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct gesv_eti_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -98,33 +98,33 @@ struct GESV { // more .cpp files. // #define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - extern template struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/lapack/impl/KokkosLapack_trtri_spec.hpp b/lapack/impl/KokkosLapack_trtri_spec.hpp index e48b37f7c2..a17184dc41 100644 --- a/lapack/impl/KokkosLapack_trtri_spec.hpp +++ b/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -37,8 +37,8 @@ struct trtri_eti_spec_avail { // This Macros provides the ETI specialization of trtri, currently not // available. // -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ template <> \ struct trtri_eti_spec_avail< \ Kokkos::View { // "extern template" skips the implicit instatiation step ensuring that the // callers code uses this explicit instantiation definition of TRTRI. // -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ - extern template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ + extern template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ - template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ + template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp index b08f523f6e..4c9058f8ab 100644 --- a/lapack/src/KokkosLapack_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -50,10 +50,11 @@ namespace KokkosLapack { /// template void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { - // NOTE: Currently, KokkosLapack::gesv only supports for MAGMA TPL and LAPACK TPL. + // NOTE: Currently, KokkosLapack::gesv only supports for MAGMA TPL and LAPACK + // TPL. // MAGMA TPL should be enabled to call the MAGMA GPU interface for - // device views LAPACK TPL should be enabled to call the LAPACK interface - // for host views + // device views LAPACK TPL should be enabled to call the LAPACK + // interface for host views static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: A must be a Kokkos::View."); @@ -87,8 +88,8 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { } // Check for no pivoting case. Only MAGMA supports no pivoting interface -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL if ((!std::is_same::value) && (IPIV0 == 0) && (IPIV.data() == nullptr)) { @@ -98,7 +99,7 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { KokkosKernels::Impl::throw_runtime_exception(os.str()); } #endif -#else // not have MAGMA TPL +#else // not have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL if ((IPIV0 == 0) && (IPIV.data() == nullptr)) { std::ostringstream os; @@ -137,11 +138,11 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { if (BXMV::rank == 1) { auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + IPIVV_Internal>::gesv(A_i, B_i, IPIV_i); } else { // BXMV::rank == 2 auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + IPIVV_Internal>::gesv(A_i, B_i, IPIV_i); } } diff --git a/lapack/src/KokkosLapack_trtri.hpp b/lapack/src/KokkosLapack_trtri.hpp index 44e8fc9f65..9a884f2303 100644 --- a/lapack/src/KokkosLapack_trtri.hpp +++ b/lapack/src/KokkosLapack_trtri.hpp @@ -108,8 +108,8 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { int result; RViewInternalType R = RViewInternalType(&result); - KokkosLapack::Impl::TRTRI::trtri(R, uplo, - diag, A); + KokkosLapack::Impl::TRTRI::trtri( + R, uplo, diag, A); return result; } diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp index 6ece9fe914..130eaba264 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -79,12 +79,12 @@ namespace Impl { template <> void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, - float* b, int ldb, int info) { + float* b, int ldb, int info) { F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> int HostLapack::trtri(const char uplo, const char diag, int n, - const float* a, int lda) { + const float* a, int lda) { int info = 0; F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); return info; @@ -96,12 +96,12 @@ int HostLapack::trtri(const char uplo, const char diag, int n, template <> void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, - double* b, int ldb, int info) { + double* b, int ldb, int info) { F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> int HostLapack::trtri(const char uplo, const char diag, int n, - const double* a, int lda) { + const double* a, int lda) { int info = 0; F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); return info; @@ -113,15 +113,15 @@ int HostLapack::trtri(const char uplo, const char diag, int n, template <> void HostLapack >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> int HostLapack >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { + int n, const std::complex* a, + int lda) { int info = 0; F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); return info; @@ -133,15 +133,16 @@ int HostLapack >::trtri(const char uplo, const char diag, template <> void HostLapack >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> int HostLapack >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { + int n, + const std::complex* a, + int lda) { int info = 0; F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); return info; diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index 74a65d4cf9..a3d8bb6ee9 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -28,7 +28,7 @@ struct gesv_tpl_spec_avail { // Generic Host side LAPACK (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -39,13 +39,13 @@ struct gesv_tpl_spec_avail { }; KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) + Kokkos::HostSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) @@ -69,7 +69,7 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -80,13 +80,13 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif }; KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) + Kokkos::CudaSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) + Kokkos::CudaSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index dcab48f07b..2baa76a132 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -45,7 +45,7 @@ inline void gesv_print_specialization() { namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_DGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -74,7 +74,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -89,65 +89,65 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), \ + B.data(), LDB, info); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSLAPACK_SGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,float]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View< \ + int*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + PViewType; \ + \ + static void gesv(const AViewType& A, const BViewType& B, \ + const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,float]"); \ + gesv_print_specialization(); \ + const bool with_pivot = \ + !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ + \ + const int N = static_cast(A.extent(1)); \ + const int AST = static_cast(A.stride(1)); \ + const int LDA = (AST == 0) ? 1 : AST; \ + const int BST = static_cast(B.stride(1)); \ + const int LDB = (BST == 0) ? 1 : BST; \ + const int NRHS = static_cast(B.extent(1)); \ + \ + int info = 0; \ + \ + if (with_pivot) { \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ + LDB, info); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSLAPACK_ZGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -178,7 +178,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -193,7 +193,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostLapack >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -202,7 +202,7 @@ namespace Impl { } \ }; -#define KOKKOSLAPACK_CGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -233,7 +233,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -248,7 +248,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostLapack >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -280,7 +280,7 @@ KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -309,7 +309,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -321,8 +321,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -339,7 +339,7 @@ namespace Impl { } \ }; -#define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -368,7 +368,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -380,8 +380,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -398,7 +398,7 @@ namespace Impl { } \ }; -#define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -429,7 +429,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -441,8 +441,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -459,7 +459,7 @@ namespace Impl { } \ }; -#define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -490,7 +490,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -502,8 +502,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ diff --git a/lapack/tpls/KokkosLapack_tpl_spec.hpp b/lapack/tpls/KokkosLapack_tpl_spec.hpp index 7bfffc780f..3aed9533bd 100644 --- a/lapack/tpls/KokkosLapack_tpl_spec.hpp +++ b/lapack/tpls/KokkosLapack_tpl_spec.hpp @@ -34,15 +34,16 @@ struct CudaLapackSingleton { }; inline void cusolver_internal_error_throw(cusolverStatus_t cusolverState, - const char* name, const char* file, - const int line) { + const char* name, const char* file, + const int line) { std::ostringstream out; // out << name << " error( " << cusolverGetStatusName(cusolverState) // << "): " << cusolverGetStatusString(cusolverState); out << name << " error( "; switch (cusolverState) { case CUSOLVER_STATUS_NOT_INITIALIZED: - out << "CUSOLVER_STATUS_NOT_INITIALIZED): the library was not initialized."; + out << "CUSOLVER_STATUS_NOT_INITIALIZED): the library was not " + "initialized."; break; case CUSOLVER_STATUS_ALLOC_FAILED: out << "CUSOLVER_STATUS_ALLOC_FAILED): the resource allocation failed."; @@ -79,9 +80,9 @@ inline void cusolver_internal_error_throw(cusolverStatus_t cusolverState, } inline void cusolver_internal_safe_call(cusolverStatus_t cusolverState, - const char* name, - const char* file = nullptr, - const int line = 0) { + const char* name, + const char* file = nullptr, + const int line = 0) { if (CUSOLVER_STATUS_SUCCESS != cusolverState) { cusolver_internal_error_throw(cusolverState, name, file, line); } @@ -90,8 +91,9 @@ inline void cusolver_internal_safe_call(cusolverStatus_t cusolverState, // The macro below defines the interface for the safe cusolver calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, \ + __LINE__) /// \brief This function converts KK transpose mode to cusolver transpose mode inline cublasOperation_t trans_mode_kk_to_cusolver(const char kkMode[]) { @@ -124,13 +126,14 @@ struct RocsolverSingleton { }; inline void rocsolver_internal_error_throw(rocsolver_status rocsolverState, - const char* name, const char* file, - const int line) { + const char* name, const char* file, + const int line) { std::ostringstream out; out << name << " error( "; switch (rocsolverState) { case rocsolver_status_invalid_handle: - out << "rocsolver_status_invalid_handle): handle not initialized, invalid " + out << "rocsolver_status_invalid_handle): handle not initialized, " + "invalid " "or null."; break; case rocsolver_status_not_implemented: @@ -143,11 +146,13 @@ inline void rocsolver_internal_error_throw(rocsolver_status rocsolverState, out << "rocsolver_status_invalid_size): invalid size argument."; break; case rocsolver_status_memory_error: - out << "rocsolver_status_memory_error): failed internal memory allocation, " + out << "rocsolver_status_memory_error): failed internal memory " + "allocation, " "copy or dealloc."; break; case rocsolver_status_internal_error: - out << "rocsolver_status_internal_error): other internal library failure."; + out << "rocsolver_status_internal_error): other internal library " + "failure."; break; case rocsolver_status_perf_degraded: out << "rocsolver_status_perf_degraded): performance degraded due to low " @@ -184,9 +189,9 @@ inline void rocsolver_internal_error_throw(rocsolver_status rocsolverState, } inline void rocsolver_internal_safe_call(rocsolver_status rocsolverState, - const char* name, - const char* file = nullptr, - const int line = 0) { + const char* name, + const char* file = nullptr, + const int line = 0) { if (rocsolver_status_success != rocsolverState) { rocsolver_internal_error_throw(rocsolverState, name, file, line); } @@ -195,8 +200,9 @@ inline void rocsolver_internal_safe_call(rocsolver_status rocsolverState, // The macro below defines the interface for the safe rocsolver calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_ROCSOLVER_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::rocsolver_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOS_ROCSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::rocsolver_internal_safe_call(call, #call, __FILE__, \ + __LINE__) /// \brief This function converts KK transpose mode to rocsolver transpose mode inline rocsolver_operation trans_mode_kk_to_rocsolver(const char kkMode[]) { diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp index d723cef260..e9fe689fef 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp @@ -27,7 +27,7 @@ struct trtri_tpl_spec_avail { }; // Generic Host side LAPACK (could be MKL or whatever) -#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ template \ struct trtri_tpl_spec_avail< \ Kokkos::View, - Kokkos::LayoutLeft, Kokkos::HostSpace) + Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, - Kokkos::HostSpace) + Kokkos::HostSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaSpace) + Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) + Kokkos::CudaUVMSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, - Kokkos::HostSpace) + Kokkos::HostSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaSpace) + Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) + Kokkos::CudaUVMSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) + Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) + Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 9f79ad2eb5..32e2434a86 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -24,8 +24,8 @@ namespace KokkosLapack { namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ + MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRTRI >, \ @@ -44,8 +44,8 @@ namespace Impl { \ static void trtri(const RViewType& R, const char uplo[], \ const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE \ - "]"); \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ const int M = static_cast(A.extent(0)); \ \ bool A_is_layout_left = \ @@ -61,7 +61,7 @@ namespace Impl { else \ uplo_ = A_is_layout_left ? 'U' : 'L'; \ \ - R() = HostLapack::trtri( \ + R() = HostLapack::trtri( \ uplo_, diag[0], M, \ reinterpret_cast(A.data()), LDA); \ Kokkos::Profiling::popRegion(); \ @@ -69,65 +69,67 @@ namespace Impl { }; #else #define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) + MEM_SPACE, ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ - LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRTRI >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ - RViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void trtri(const RViewType& R, const char uplo[], \ - const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE \ - "]"); \ - magma_int_t M = static_cast(A.extent(0)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - \ - magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - magma_int_t info = 0; \ - magma_uplo_t uplo_; \ - magma_diag_t diag_; \ - \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ - else \ - uplo_ = A_is_layout_left ? MagmaUpper : MagmaLower; \ - \ - if (diag[0] == 'U' || diag[0] == 'u') \ - diag_ = MagmaUnit; \ - else \ - diag_ = MagmaNonUnit; \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - R() = MAGMA_FN(uplo_, diag_, M, \ - reinterpret_cast( \ - const_cast(A.data())), \ - LDA, &info); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ + MAGMA_FN, LAYOUTA, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct TRTRI >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View > \ + RViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void trtri(const RViewType& R, const char uplo[], \ + const char diag[], const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ + magma_int_t M = static_cast(A.extent(0)); \ + \ + bool A_is_layout_left = \ + std::is_same::value; \ + \ + magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ + LDA = (AST == 0) ? 1 : AST; \ + magma_int_t info = 0; \ + magma_uplo_t uplo_; \ + magma_diag_t diag_; \ + \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ + else \ + uplo_ = A_is_layout_left ? MagmaUpper : MagmaLower; \ + \ + if (diag[0] == 'U' || diag[0] == 'u') \ + diag_ = MagmaUnit; \ + else \ + diag_ = MagmaNonUnit; \ + \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ + R() = MAGMA_FN(uplo_, diag_, M, \ + reinterpret_cast( \ + const_cast(A.data())), \ + LDA, &info); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #else -#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ - LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ + MAGMA_FN, LAYOUTA, MEM_SPACE, \ + ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA // Explicitly define the TRTRI class for all permutations listed below @@ -135,39 +137,42 @@ namespace Impl { // Handle type and space permutations #define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ + ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + LAYOUTA, Kokkos::CudaUVMSpace, \ + ETI_SPEC_AVAIL) #define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ + ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) - -#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ - magma_ztrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ - magma_ztrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ - ETI_SPEC_AVAIL) - -#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ - magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ - magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ - ETI_SPEC_AVAIL) + LAYOUTA, Kokkos::CudaUVMSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ + std::complex, LAYOUTA, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ + magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ + Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ + LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ + magmaFloatComplex_ptr, magma_ctrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ + Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) // Handle layout permutations KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, true) diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index f37770c812..06f51b7eb0 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -16,10 +16,11 @@ // only enable this test where KokkosLapack supports gesv: // CUDA+MAGMA and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || \ + defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include @@ -96,8 +97,8 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // and no-tpl case bool nopivot_runtime_err = false; bool notpl_runtime_err = false; -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -105,7 +106,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { #else notpl_runtime_err = true; #endif -#else // not have MAGMA TPL +#else // not have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; @@ -201,8 +202,8 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, // and no-tpl case bool nopivot_runtime_err = false; bool notpl_runtime_err = false; -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -210,7 +211,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, #else notpl_runtime_err = true; #endif -#else // not have MAGMA TPL +#else // not have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; @@ -387,8 +388,7 @@ TEST_F(TestCategory, gesv_complex_double) { TEST_F(TestCategory, gesv_mrhs_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_double"); test_gesv_mrhs, TestDevice>("N"); // No pivoting - test_gesv_mrhs, TestDevice>( - "Y"); // Partial pivoting + test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -406,8 +406,7 @@ TEST_F(TestCategory, gesv_complex_float) { TEST_F(TestCategory, gesv_mrhs_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_float"); test_gesv_mrhs, TestDevice>("N"); // No pivoting - test_gesv_mrhs, TestDevice>( - "Y"); // Partial pivoting + test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif diff --git a/lapack/unit_test/Test_Lapack_trtri.hpp b/lapack/unit_test/Test_Lapack_trtri.hpp index 0105803567..a19e575d89 100644 --- a/lapack/unit_test/Test_Lapack_trtri.hpp +++ b/lapack/unit_test/Test_Lapack_trtri.hpp @@ -118,8 +118,8 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, // const int As0 = A.stride(0), As1 = A.stride(1); // const int Ae0 = A.extent(0), Ae1 = A.extent(1); - // printf("KokkosLapack::trtri test for %c %c, M %d, N %d, eps %g, ViewType: %s, - // A.stride(0): %d, A.stride(1): %d, A.extent(0): %d, A.extent(1): %d + // printf("KokkosLapack::trtri test for %c %c, M %d, N %d, eps %g, ViewType: + // %s, A.stride(0): %d, A.stride(1): %d, A.extent(0): %d, A.extent(1): %d // START\n", uplo[0],diag[0],M,N,eps,typeid(ViewTypeA).name(), As0, As1, Ae0, // Ae1); fflush(stdout); From f8cd2cb8aa6c2f53a422e17bcdfb07d4f82f220f Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 25 Oct 2023 00:07:10 -0600 Subject: [PATCH 046/326] Small cleaning --- cmake/kokkoskernels_tpls.cmake | 14 -------------- lapack/tpls/KokkosLapack_Host_tpl.cpp | 1 - 2 files changed, 15 deletions(-) diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index be1488e051..f650168757 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -440,20 +440,6 @@ IF ("${F77_BLAS_MANGLE}" STREQUAL "") ENDIF() ENDIF() -# AquiEEP -IF ("${F77_LAPACK_MANGLE}" STREQUAL "") - IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_MAGMA OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) - ENABLE_LANGUAGE(C) - ENABLE_LANGUAGE(Fortran) - INCLUDE(FortranCInterface) - IF (FortranCInterface_GLOBAL_SUFFIX STREQUAL "") - SET(F77_LAPACK_MANGLE "(name,NAME) ${FortranCInterface_GLOBAL_PREFIX}name") - ELSE () - SET(F77_LAPACK_MANGLE "(name,NAME) ${FortranCInterface_GLOBAL_PREFIX}name ## ${FortranCInterface_GLOBAL_SUFFIX}") - ENDIF () - ENDIF() -ENDIF() - KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_CUDA_TPLS OFF BOOL "Whether CUDA TPLs should be enabled by default. Default: OFF") SET(CUBLAS_DEFAULT ${KOKKOS_ENABLE_CUDA}) SET(CUSPARSE_DEFAULT ${KOKKOS_ENABLE_CUDA}) diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp index 130eaba264..d629a17f1d 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -29,7 +29,6 @@ extern "C" { /// Gesv /// -// AquiEEP void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, int*); void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, From edf2dd011264da42017fb217857aa8052af76968 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 25 Oct 2023 02:16:37 -0600 Subject: [PATCH 047/326] Correcting error in Jenkins --- perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp index cbadcef0b1..de2db8dbb0 100644 --- a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp @@ -21,7 +21,7 @@ #include -#include +#include #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" @@ -185,7 +185,7 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { for (int i = 0; i < options.start.a.k; ++i) { auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosLapack::trtri(&trtri_args.uplo, &trtri_args.diag, A); } // Fence after each batch operation Kokkos::fence(); @@ -196,7 +196,7 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { for (int i = 0; i < options.start.a.k; ++i) { auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosLapack::trtri(&trtri_args.uplo, &trtri_args.diag, A); } // Fence after each batch operation Kokkos::fence(); @@ -300,7 +300,7 @@ struct parallel_blas_trtri { void operator()(const int& i) const { auto svA = Kokkos::subview(trtri_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); + KokkosLapack::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); } }; #endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP && From 915766579c33a179208158a15b89aa632ecba1f8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 25 Oct 2023 04:13:46 -0600 Subject: [PATCH 048/326] Fixing compilation error on Jenkins when dealing with HIP --- lapack/unit_test/backends/Test_HIP_Lapack.cpp | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 lapack/unit_test/backends/Test_HIP_Lapack.cpp diff --git a/lapack/unit_test/backends/Test_HIP_Lapack.cpp b/lapack/unit_test/backends/Test_HIP_Lapack.cpp new file mode 100644 index 0000000000..c0ec152233 --- /dev/null +++ b/lapack/unit_test/backends/Test_HIP_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_HIP_LAPACK_CPP +#define TEST_HIP_LAPACK_CPP + +#include "Test_HIP.hpp" +#include "Test_Lapack.hpp" + +#endif // TEST_HIP_LAPACK_CPP From b1d77bd190115deb10e304f515365b32ba818bc5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 Oct 2023 07:05:23 -0600 Subject: [PATCH 049/326] Add required rtd conf file --- .readthedocs.yaml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000000..519282a179 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file From 93211aa15570a39db5f4fd6330325a0f4bfaa775 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 Oct 2023 07:07:55 -0600 Subject: [PATCH 050/326] README.md: Use correct project slug --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0da1057870..bdad1442ce 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Generic badge](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](https://kokkos-kernels.readthedocs.io/en/latest/) +[![Generic badge](https://readthedocs.org/projects/kokkos-kernels/badge/?version=latest)](https://kokkos-kernels.readthedocs.io/en/latest/) ![KokkosKernels](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) From ec8a919533e947c2ad4fc2f84420e7c4ab2c8b39 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 Oct 2023 07:15:33 -0600 Subject: [PATCH 051/326] docs/requirements.txt: Add sphinx-rtd-theme --- docs/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 188f51e62d..75f092707b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,2 @@ -breathe \ No newline at end of file +breathe +sphinx-rtd-theme \ No newline at end of file From 699f3b3f54ee57bafd200709b36a42c461708eaa Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 25 Oct 2023 12:42:58 -0600 Subject: [PATCH 052/326] Addressing latest feedbacks from Luc. --- cmake/KokkosKernels_config.h.in | 2 +- lapack/tpls/KokkosLapack_tpl_spec.hpp | 241 ------------------ .../KokkosLapack_trtri_tpl_spec_avail.hpp | 22 ++ .../tpls/KokkosLapack_trtri_tpl_spec_decl.hpp | 27 +- 4 files changed, 48 insertions(+), 244 deletions(-) delete mode 100644 lapack/tpls/KokkosLapack_tpl_spec.hpp diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 4c54a350b3..7a61771231 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -109,7 +109,7 @@ /* BLAS library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_BLAS -/* LAPACKE */ +/* LAPACK */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL diff --git a/lapack/tpls/KokkosLapack_tpl_spec.hpp b/lapack/tpls/KokkosLapack_tpl_spec.hpp deleted file mode 100644 index 3aed9533bd..0000000000 --- a/lapack/tpls/KokkosLapack_tpl_spec.hpp +++ /dev/null @@ -1,241 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSLAPACK_TPL_SPEC_HPP_ -#define KOKKOSLAPACK_TPL_SPEC_HPP_ - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER -#include "cuda_runtime.h" -//#include "cublas_v2.h" -#include "cusolverDn.h" - -namespace KokkosLapack { -namespace Impl { - -struct CudaLapackSingleton { - cusolverDnHandle_t handle; - - CudaLapackSingleton(); - - static CudaLapackSingleton& singleton(); -}; - -inline void cusolver_internal_error_throw(cusolverStatus_t cusolverState, - const char* name, const char* file, - const int line) { - std::ostringstream out; - // out << name << " error( " << cusolverGetStatusName(cusolverState) - // << "): " << cusolverGetStatusString(cusolverState); - out << name << " error( "; - switch (cusolverState) { - case CUSOLVER_STATUS_NOT_INITIALIZED: - out << "CUSOLVER_STATUS_NOT_INITIALIZED): the library was not " - "initialized."; - break; - case CUSOLVER_STATUS_ALLOC_FAILED: - out << "CUSOLVER_STATUS_ALLOC_FAILED): the resource allocation failed."; - break; - case CUSOLVER_STATUS_INVALID_VALUE: - out << "CUSOLVER_STATUS_INVALID_VALUE): an invalid numerical value was " - "used as an argument."; - break; - case CUSOLVER_STATUS_ARCH_MISMATCH: - out << "CUSOLVER_STATUS_ARCH_MISMATCH): an absent device architectural " - "feature is required."; - break; - case CUSOLVER_STATUS_MAPPING_ERROR: - out << "CUSOLVER_STATUS_MAPPING_ERROR): an access to GPU memory space " - "failed."; - break; - case CUSOLVER_STATUS_EXECUTION_FAILED: - out << "CUSOLVER_STATUS_EXECUTION_FAILED): the GPU program failed to " - "execute."; - break; - case CUSOLVER_STATUS_INTERNAL_ERROR: - out << "CUSOLVER_STATUS_INTERNAL_ERROR): an internal operation failed."; - break; - case CUSOLVER_STATUS_NOT_SUPPORTED: - out << "CUSOLVER_STATUS_NOT_SUPPORTED): the feature required is not " - "supported."; - break; - default: out << "unrecognized error code): this is bad!"; break; - } - if (file) { - out << " " << file << ":" << line; - } - throw std::runtime_error(out.str()); -} - -inline void cusolver_internal_safe_call(cusolverStatus_t cusolverState, - const char* name, - const char* file = nullptr, - const int line = 0) { - if (CUSOLVER_STATUS_SUCCESS != cusolverState) { - cusolver_internal_error_throw(cusolverState, name, file, line); - } -} - -// The macro below defines the interface for the safe cusolver calls. -// The functions themselves are protected by impl namespace and this -// is not meant to be used by external application or libraries. -#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, \ - __LINE__) - -/// \brief This function converts KK transpose mode to cusolver transpose mode -inline cublasOperation_t trans_mode_kk_to_cusolver(const char kkMode[]) { - cublasOperation_t trans; - if ((kkMode[0] == 'N') || (kkMode[0] == 'n')) - trans = CUBLAS_OP_N; - else if ((kkMode[0] == 'T') || (kkMode[0] == 't')) - trans = CUBLAS_OP_T; - else - trans = CUBLAS_OP_C; - return trans; -} - -} // namespace Impl -} // namespace KokkosLapack -#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER -#include - -namespace KokkosLapack { -namespace Impl { - -struct RocsolverSingleton { - rocsolver_handle handle; - - RocsolverSingleton(); - - static RocsolverSingleton& singleton(); -}; - -inline void rocsolver_internal_error_throw(rocsolver_status rocsolverState, - const char* name, const char* file, - const int line) { - std::ostringstream out; - out << name << " error( "; - switch (rocsolverState) { - case rocsolver_status_invalid_handle: - out << "rocsolver_status_invalid_handle): handle not initialized, " - "invalid " - "or null."; - break; - case rocsolver_status_not_implemented: - out << "rocsolver_status_not_implemented): function is not implemented."; - break; - case rocsolver_status_invalid_pointer: - out << "rocsolver_status_invalid_pointer): invalid pointer argument."; - break; - case rocsolver_status_invalid_size: - out << "rocsolver_status_invalid_size): invalid size argument."; - break; - case rocsolver_status_memory_error: - out << "rocsolver_status_memory_error): failed internal memory " - "allocation, " - "copy or dealloc."; - break; - case rocsolver_status_internal_error: - out << "rocsolver_status_internal_error): other internal library " - "failure."; - break; - case rocsolver_status_perf_degraded: - out << "rocsolver_status_perf_degraded): performance degraded due to low " - "device memory."; - break; - case rocsolver_status_size_query_mismatch: - out << "unmatched start/stop size query): ."; - break; - case rocsolver_status_size_increased: - out << "rocsolver_status_size_increased): queried device memory size " - "increased."; - break; - case rocsolver_status_size_unchanged: - out << "rocsolver_status_size_unchanged): queried device memory size " - "unchanged."; - break; - case rocsolver_status_invalid_value: - out << "rocsolver_status_invalid_value): passed argument not valid."; - break; - case rocsolver_status_continue: - out << "rocsolver_status_continue): nothing preventing function to " - "proceed."; - break; - case rocsolver_status_check_numerics_fail: - out << "rocsolver_status_check_numerics_fail): will be set if the " - "vector/matrix has a NaN or an Infinity."; - break; - default: out << "unrecognized error code): this is bad!"; break; - } - if (file) { - out << " " << file << ":" << line; - } - throw std::runtime_error(out.str()); -} - -inline void rocsolver_internal_safe_call(rocsolver_status rocsolverState, - const char* name, - const char* file = nullptr, - const int line = 0) { - if (rocsolver_status_success != rocsolverState) { - rocsolver_internal_error_throw(rocsolverState, name, file, line); - } -} - -// The macro below defines the interface for the safe rocsolver calls. -// The functions themselves are protected by impl namespace and this -// is not meant to be used by external application or libraries. -#define KOKKOS_ROCSOLVER_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::rocsolver_internal_safe_call(call, #call, __FILE__, \ - __LINE__) - -/// \brief This function converts KK transpose mode to rocsolver transpose mode -inline rocsolver_operation trans_mode_kk_to_rocsolver(const char kkMode[]) { - rocsolver_operation trans; - if ((kkMode[0] == 'N') || (kkMode[0] == 'n')) - trans = rocsolver_operation_none; - else if ((kkMode[0] == 'T') || (kkMode[0] == 't')) - trans = rocsolver_operation_transpose; - else - trans = rocsolver_operation_conjugate_transpose; - return trans; -} - -} // namespace Impl -} // namespace KokkosLapack - -#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER - -// If LAPACK TPL is enabled, it is preferred over magma's LAPACK -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include "magma_v2.h" - -namespace KokkosLapack { -namespace Impl { - -struct MagmaSingleton { - MagmaSingleton(); - - static MagmaSingleton& singleton(); -}; - -} // namespace Impl -} // namespace KokkosLapack -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA - -#endif // KOKKOSLAPACK_TPL_SPEC_HPP_ diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp index e9fe689fef..7251d97086 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp @@ -53,57 +53,79 @@ struct trtri_tpl_spec_avail { KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +#endif + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +#endif + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +#endif + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +#endif KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +#endif + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +#endif + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +#endif + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +#endif } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 32e2434a86..6aadf1ad72 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -18,7 +18,7 @@ #define KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ #include "KokkosLapack_Host_tpl.hpp" // trtri prototype -#include "KokkosLapack_tpl_spec.hpp" +//#include "KokkosLapack_tpl_spec.hpp" namespace KokkosLapack { namespace Impl { @@ -135,7 +135,9 @@ namespace Impl { // Explicitly define the TRTRI class for all permutations listed below // Handle type and space permutations -#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ +#ifdef KOKKOS_ENABLE_CUDA + +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ @@ -174,6 +176,27 @@ namespace Impl { Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, \ LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) +#else + +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ + std::complex, LAYOUTA, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ + LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) + +#endif + // Handle layout permutations KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, true) KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, false) From d674964889aec3b47c7252be0bdc41ae31fc7423 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 25 Oct 2023 12:53:05 -0600 Subject: [PATCH 053/326] Formatting --- lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 6aadf1ad72..3ed0623018 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -137,7 +137,7 @@ namespace Impl { // Handle type and space permutations #ifdef KOKKOS_ENABLE_CUDA -#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ ETI_SPEC_AVAIL) \ KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ @@ -178,17 +178,17 @@ namespace Impl { #else -#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ - std::complex, LAYOUTA, \ +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ + std::complex, LAYOUTA, \ Kokkos::HostSpace, ETI_SPEC_AVAIL) #define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ From 851358f3d7ee601c89dd2aad0a376ab13efe7ea9 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 25 Oct 2023 14:11:39 -0600 Subject: [PATCH 054/326] KokkosKernelsConfig.cmake: add all_libs target and necessary aliases * Intent of these changes is to allow for building Trilinos with KokkosKernels as an external TPL --- cmake/KokkosKernelsConfig.cmake.in | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index fbceffe76c..9b649d26c6 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -11,3 +11,13 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) INCLUDE("${KokkosKernels_CMAKE_DIR}/KokkosKernelsTargets.cmake") +IF(NOT TARGET KokkosKernels::all_libs) + # CMake Error at /lib/cmake/Kokkos/KokkosConfigCommon.cmake:10 (ADD_LIBRARY): + # ADD_LIBRARY cannot create ALIAS target "Kokkos::all_libs" because target + # "KokkosKernels::kokkoskernels" is imported but not globally visible. + IF(CMAKE_VERSION VERSION_LESS "3.18") + SET_TARGET_PROPERTIES(Kokkos::kokkoskernels PROPERTIES IMPORTED_GLOBAL ON) + ENDIF() + ADD_LIBRARY(KokkosKernels::all_libs ALIAS Kokkos::kokkoskernels) + ADD_LIBRARY(KokkosKernels::kokkoskernels ALIAS Kokkos::kokkoskernels) +ENDIF() From 252c9db58e3469eb6dadb333c17f7fcbd827b039 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 26 Oct 2023 14:02:51 -0600 Subject: [PATCH 055/326] hide native merge-path SpMV behind "native-merge" --- sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 060a9d66c7..4f90002a61 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -31,7 +31,7 @@ namespace KokkosSparse { namespace Impl { -constexpr const char* KOKKOSSPARSE_ALG_MERGE = "merge"; +constexpr const char* KOKKOSSPARSE_ALG_NATIVE_MERGE = "native-merge"; // This TransposeFunctor is functional, but not necessarily performant. template ::spmv( exec, mode, alpha, A, x, beta, y); } else { @@ -640,7 +640,7 @@ static void spmv_beta(const execution_space& exec, false>(exec, controls, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { - if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { From 8ba47b90dc10440bbc9edec2ecd4ece8cad1182f Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 27 Oct 2023 10:15:12 -0600 Subject: [PATCH 056/326] test native-merge algorithm --- sparse/unit_test/Test_Sparse_spmv.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 8fdb56b5f4..990fcc1a30 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -549,6 +549,12 @@ void test_spmv_algorithms(lno_t numRows, size_type nnz, lno_t bandwidth, test_spmv( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } + { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "native-merge"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } } template Date: Fri, 27 Oct 2023 17:07:31 -0600 Subject: [PATCH 057/326] Quick fix for night compilation with Trilinos --- .../backends/Test_OpenMPTarget_Lapack.cpp | 22 +++++++++++++++++++ .../unit_test/backends/Test_SYCL_Lapack.cpp | 22 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp create mode 100644 lapack/unit_test/backends/Test_SYCL_Lapack.cpp diff --git a/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp b/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp new file mode 100644 index 0000000000..5191918ce9 --- /dev/null +++ b/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMPTARGET_LAPACK_CPP +#define TEST_OPENMPTARGET_LAPACK_CPP + +#include "Test_OpenMPTarget.hpp" +#include "Test_Lapack.hpp" + +#endif // TEST_OPENMPTARGET_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_SYCL_Lapack.cpp b/lapack/unit_test/backends/Test_SYCL_Lapack.cpp new file mode 100644 index 0000000000..9485f2a420 --- /dev/null +++ b/lapack/unit_test/backends/Test_SYCL_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SYCL_LAPACK_CPP +#define TEST_SYCL_LAPACK_CPP + +#include +#include + +#endif // TEST_SYCL_LAPACK_CPP From 96e7fb5c5459c6d7303710198b78b3b2508933c1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Oct 2023 08:55:25 -0600 Subject: [PATCH 058/326] SPTRSV: check if cusparse is available before calling TPL path Since SpTRSV does not implement the TPL layer the usual way we need to be extra careful before calling the TPL implementation path. If cusparse is not available then we definitely want to revert back to calling the native implementation. Similarly, if the execution space is not Kokkos::CUDA, let's use the native implementation. --- sparse/src/KokkosSparse_sptrsv.hpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 9ab7c9fc6a..e8fd8ea26c 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -220,6 +220,8 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE RowMap_Internal rowmap_i = rowmap; Entries_Internal entries_i = entries; Values_Internal values_i = values; @@ -233,6 +235,9 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, false); +#else // We better go to the native implementation + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); +#endif } else { KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } @@ -392,6 +397,7 @@ void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; sptrsvHandleType *sh = handle->get_sptrsv_handle(); auto nrows = sh->get_nrows(); @@ -400,7 +406,13 @@ void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>( space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); - +#else + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, + values_i, b_i, x_i); +#endif } else { KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, @@ -748,13 +760,21 @@ void sptrsv_solve_streams(const std::vector &execspace_v, if (handle_v[0]->get_sptrsv_handle()->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // NOTE: assume all streams use the same SPTRSV_CUSPARSE algo. KokkosSparse::Impl::sptrsvcuSPARSE_solve_streams< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>( execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, x_i_v, false); - +#else + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, + rowmap_i_v, entries_i_v, + values_i_v, b_i_v, x_i_v); +#endif } else { KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, From 9107b3e81111b183cf2295955b82da72c340ea84 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Oct 2023 09:28:44 -0600 Subject: [PATCH 059/326] SpTRSV: more strickly check prerequisites in SptrsvHandle Check that CUSPARSE is enabled and that HandleExecSpace is Kokkos::CUDA before allowing users to set the implementation to use the CUSPARSE TPL. --- sparse/src/KokkosSparse_sptrsv_handle.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index 7c9027d24a..e110734045 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -476,6 +476,18 @@ class SPTRSVHandle { this->set_if_algm_require_symb_lvlsched(); this->set_if_algm_require_symb_chain(); + // Check a few prerequisites before allowing users + // to run with the cusparse implementation of sptrsv. + if(algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE) { +#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + throw(std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but cuSPARSE TPL not enabled.")); +#else + if(!std::is_same_v) { + throw(std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but HandleExecSpace is not Kokkos::CUDA.")); + } +#endif + } + #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV if (lower_tri) { // lower-triangular is stored in CSC From 9408e49fa2f36455586efb0b18b4b6ee84cf9679 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Oct 2023 10:16:22 -0600 Subject: [PATCH 060/326] SpTRSV: fix some type definition and variable usaged for cuSPARSE Since we are guarding the cusparse path a bit better we need to be careful when some types are defined and to mark some variables (void) when they do not get used by an implementation... --- sparse/src/KokkosSparse_sptrsv.hpp | 33 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index e8fd8ea26c..2fa021838c 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -190,38 +190,35 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, const_handle_type; const_handle_type tmp_handle(*handle); - typedef Kokkos::View< +#ifdef KK_TRISOLVE_TIMERS + Kokkos::Timer timer_sptrsv; +#endif + auto sptrsv_handle = handle->get_sptrsv_handle(); + if (sptrsv_handle->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + using RowMap_Internal = Kokkos::View< typename lno_row_view_t_::const_value_type *, typename KokkosKernels::Impl::GetUnifiedLayout< lno_row_view_t_>::array_layout, typename lno_row_view_t_::device_type, - Kokkos::MemoryTraits > - RowMap_Internal; + Kokkos::MemoryTraits >; - typedef Kokkos::View< + using Entries_Internal = Kokkos::View< typename lno_nnz_view_t_::const_value_type *, typename KokkosKernels::Impl::GetUnifiedLayout< lno_nnz_view_t_>::array_layout, typename lno_nnz_view_t_::device_type, - Kokkos::MemoryTraits > - Entries_Internal; + Kokkos::MemoryTraits >; - typedef Kokkos::View< + using Values_Internal = Kokkos::View< typename scalar_nnz_view_t_::const_value_type *, typename KokkosKernels::Impl::GetUnifiedLayout< scalar_nnz_view_t_>::array_layout, typename scalar_nnz_view_t_::device_type, - Kokkos::MemoryTraits > - Values_Internal; - -#ifdef KK_TRISOLVE_TIMERS - Kokkos::Timer timer_sptrsv; -#endif - auto sptrsv_handle = handle->get_sptrsv_handle(); - if (sptrsv_handle->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + Kokkos::MemoryTraits >; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE RowMap_Internal rowmap_i = rowmap; Entries_Internal entries_i = entries; Values_Internal values_i = values; @@ -236,9 +233,11 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, false); #else // We better go to the native implementation + (void) values; KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); #endif } else { + (void) values; KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } #ifdef KK_TRISOLVE_TIMERS From daee1b6cf8f62f9101b0514fa9d60476415fc2e0 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Oct 2023 10:18:42 -0600 Subject: [PATCH 061/326] SpTRSV: applying clang-format --- sparse/src/KokkosSparse_sptrsv.hpp | 37 +++++++++++------------ sparse/src/KokkosSparse_sptrsv_handle.hpp | 12 +++++--- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 2fa021838c..6b3d452ec0 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -196,28 +196,27 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { - #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE using RowMap_Internal = Kokkos::View< - typename lno_row_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - lno_row_view_t_>::array_layout, - typename lno_row_view_t_::device_type, - Kokkos::MemoryTraits >; + typename lno_row_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_row_view_t_>::array_layout, + typename lno_row_view_t_::device_type, + Kokkos::MemoryTraits >; using Entries_Internal = Kokkos::View< - typename lno_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - lno_nnz_view_t_>::array_layout, - typename lno_nnz_view_t_::device_type, - Kokkos::MemoryTraits >; + typename lno_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_nnz_view_t_>::array_layout, + typename lno_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; using Values_Internal = Kokkos::View< - typename scalar_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - scalar_nnz_view_t_>::array_layout, - typename scalar_nnz_view_t_::device_type, - Kokkos::MemoryTraits >; + typename scalar_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + scalar_nnz_view_t_>::array_layout, + typename scalar_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; RowMap_Internal rowmap_i = rowmap; Entries_Internal entries_i = entries; @@ -232,12 +231,12 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, false); -#else // We better go to the native implementation - (void) values; +#else // We better go to the native implementation + (void)values; KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); #endif } else { - (void) values; + (void)values; KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } #ifdef KK_TRISOLVE_TIMERS diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index e110734045..4543ef65a7 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -478,12 +478,16 @@ class SPTRSVHandle { // Check a few prerequisites before allowing users // to run with the cusparse implementation of sptrsv. - if(algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + if (algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE) { #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - throw(std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but cuSPARSE TPL not enabled.")); + throw( + std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but " + "cuSPARSE TPL not enabled.")); #else - if(!std::is_same_v) { - throw(std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but HandleExecSpace is not Kokkos::CUDA.")); + if (!std::is_same_v) { + throw( + std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but " + "HandleExecSpace is not Kokkos::CUDA.")); } #endif } From e88c418c438163248d9dd81eb121d2a52fa60462 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Oct 2023 18:05:56 -0600 Subject: [PATCH 062/326] SpTRSV: more fixes --- sparse/src/KokkosSparse_sptrsv.hpp | 36 ++++++++++++++++++----- sparse/src/KokkosSparse_sptrsv_handle.hpp | 2 +- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 6b3d452ec0..7032366be2 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -161,6 +161,9 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; + static_assert(std::is_same_v, + "sptrsv_symbolic: ExecutionSpace and HandleExecSpace need to match!"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( typename lno_row_view_t_::non_const_value_type, size_type), "sptrsv_symbolic: A size_type must match KernelHandle " @@ -197,6 +200,7 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if constexpr (std::is_same_v) { using RowMap_Internal = Kokkos::View< typename lno_row_view_t_::const_value_type *, typename KokkosKernels::Impl::GetUnifiedLayout< @@ -230,6 +234,10 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, false); + } else { + (void)values; + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); + } #else // We better go to the native implementation (void)values; @@ -263,6 +271,7 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { using ExecutionSpace = typename KernelHandle::HandleExecSpace; auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries, values); } @@ -295,6 +304,9 @@ void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; + static_assert(std::is_same_v, + "sptrsv solve: ExecutionSpace and HandleExecSpace need to match"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( typename lno_row_view_t_::non_const_value_type, size_type), "sptrsv_solve: A size_type must match KernelHandle size_type " @@ -396,14 +408,22 @@ void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; - sptrsvHandleType *sh = handle->get_sptrsv_handle(); - auto nrows = sh->get_nrows(); - - KokkosSparse::Impl::sptrsvcuSPARSE_solve< - ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, - Values_Internal, BType_Internal, XType_Internal>( - space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + if constexpr (std::is_same_v) { + typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; + sptrsvHandleType *sh = handle->get_sptrsv_handle(); + auto nrows = sh->get_nrows(); + + KokkosSparse::Impl::sptrsvcuSPARSE_solve< + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>( + space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + } else { + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, + values_i, b_i, x_i); + } #else KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index 4543ef65a7..cf23bfdc1f 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -484,7 +484,7 @@ class SPTRSVHandle { std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but " "cuSPARSE TPL not enabled.")); #else - if (!std::is_same_v) { + if (!std::is_same_v) { throw( std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but " "HandleExecSpace is not Kokkos::CUDA.")); From ef6f19ee13b496a742e671e0352f53ccfe546b99 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Oct 2023 18:07:27 -0600 Subject: [PATCH 063/326] SpTRSV: apply clang-format --- sparse/src/KokkosSparse_sptrsv.hpp | 83 ++++++++++++++++-------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 7032366be2..1fef3e9f1b 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -161,8 +161,9 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; - static_assert(std::is_same_v, - "sptrsv_symbolic: ExecutionSpace and HandleExecSpace need to match!"); + static_assert( + std::is_same_v, + "sptrsv_symbolic: ExecutionSpace and HandleExecSpace need to match!"); static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( typename lno_row_view_t_::non_const_value_type, size_type), @@ -201,42 +202,43 @@ void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE if constexpr (std::is_same_v) { - using RowMap_Internal = Kokkos::View< - typename lno_row_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - lno_row_view_t_>::array_layout, - typename lno_row_view_t_::device_type, - Kokkos::MemoryTraits >; - - using Entries_Internal = Kokkos::View< - typename lno_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - lno_nnz_view_t_>::array_layout, - typename lno_nnz_view_t_::device_type, - Kokkos::MemoryTraits >; - - using Values_Internal = Kokkos::View< - typename scalar_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - scalar_nnz_view_t_>::array_layout, - typename scalar_nnz_view_t_::device_type, - Kokkos::MemoryTraits >; - - RowMap_Internal rowmap_i = rowmap; - Entries_Internal entries_i = entries; - Values_Internal values_i = values; - - typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; - sptrsvHandleType *sh = handle->get_sptrsv_handle(); - auto nrows = sh->get_nrows(); - - KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< - ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, - Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, - false); + using RowMap_Internal = Kokkos::View< + typename lno_row_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_row_view_t_>::array_layout, + typename lno_row_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Entries_Internal = Kokkos::View< + typename lno_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_nnz_view_t_>::array_layout, + typename lno_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Values_Internal = Kokkos::View< + typename scalar_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + scalar_nnz_view_t_>::array_layout, + typename scalar_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + RowMap_Internal rowmap_i = rowmap; + Entries_Internal entries_i = entries; + Values_Internal values_i = values; + + typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; + sptrsvHandleType *sh = handle->get_sptrsv_handle(); + auto nrows = sh->get_nrows(); + + KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, + false); } else { (void)values; - KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, + entries); } #else // We better go to the native implementation @@ -304,8 +306,9 @@ void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; - static_assert(std::is_same_v, - "sptrsv solve: ExecutionSpace and HandleExecSpace need to match"); + static_assert( + std::is_same_v, + "sptrsv solve: ExecutionSpace and HandleExecSpace need to match"); static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( typename lno_row_view_t_::non_const_value_type, size_type), @@ -417,13 +420,13 @@ void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>( space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); - } else { + } else { KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i); - } + } #else KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, From f0d748399c6ea2c479307be3a6c4dc8c06a40ed9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Sun, 5 Nov 2023 13:04:37 -0700 Subject: [PATCH 064/326] SYCL: fix for Trilinos build with MKL --- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 4 +++- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 6 ++++-- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 4 +++- cmake/KokkosKernels_config.h.in | 1 + cmake/kokkoskernels_tpls.cmake | 4 ++++ sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 2 +- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 2 +- 8 files changed, 18 insertions(+), 7 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 7bc55becc0..7dedf35c8b 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -88,7 +88,9 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) \ + && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) \ + && defined(KOKKOS_ENABLE_SYCL) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #endif diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 62139d2b12..bffa9e2a0c 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -364,7 +364,9 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) \ + && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) \ + && defined(KOKKOS_ENABLE_SYCL) #include #include #include @@ -437,6 +439,6 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(false) } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL && KOKKOS_ENABLE_SYCL #endif diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 70b5560f6e..0b6ca09a5f 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -161,7 +161,7 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#ifdef KOKKOS_ENABLE_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) #define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ template \ diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 304dd349bf..42a6f1ebf4 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -767,7 +767,9 @@ KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS // ONEMKL -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) \ + && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) \ + && defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 7a61771231..d94860e380 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -29,6 +29,7 @@ requires (a) header file(s) as well, and may use functions other than just BLAS and LAPACK functions. */ #cmakedefine HAVE_KOKKOSKERNELS_MKL +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE #cmakedefine KOKKOSKERNELS_ENABLE_BENCHMARK diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index f650168757..08c7158148 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -31,6 +31,10 @@ MACRO(KOKKOSKERNELS_ADD_TPL_OPTION NAME DEFAULT_VALUE DOCSTRING) SET(ROOT_DEFAULT $ENV{${_NAME_ORIG}_ROOT}) KOKKOSKERNELS_ADD_OPTION(${_NAME_ORIG}_ROOT "${ROOT_DEFAULT}" PATH "Location of ${_NAME} install root. Default: None or the value of the environment variable ${_NAME}_ROOT if set") IF (DEFINED TPL_ENABLE_${_NAME}) + IF (${_NAME} STREQUAL MKL AND KOKKOSKERNELS_HAS_TRILINOS) + MESSAGE("Trilinos has enabled MKL and SYCL but it does not detect oneMKL correctly so we disable it!") + SET(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE ON) + ENDIF () IF (TPL_ENABLE_${_NAME} AND NOT KOKKOSKERNELS_ENABLE_TPL_${_NAME}) MESSAGE("Overriding KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG}=OFF with TPL_ENABLE_${_NAME}=ON") SET(KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG} ON) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 01a0ce1373..f6d8e0806f 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -249,7 +249,7 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif -#ifdef KOKKOS_ENABLE_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 6ac5f49296..0540bce46a 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -686,7 +686,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #undef KOKKOSSPARSE_SPMV_MKL #endif -#ifdef KOKKOS_ENABLE_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { switch (toupper(mode_kk)) { case 'N': return oneapi::mkl::transpose::nontrans; From a0c9b7555d3b1eaba7f2b796e2803be1a1471999 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Sun, 5 Nov 2023 14:12:30 -0700 Subject: [PATCH 065/326] Apply clang-format to non-cmake files --- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 6 +++--- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 6 +++--- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 3 ++- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 6 +++--- sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 3 ++- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 3 ++- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 7dedf35c8b..de930f6107 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -88,9 +88,9 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) \ - && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) \ - && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #endif diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index bffa9e2a0c..736523aa8d 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -364,9 +364,9 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) \ - && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) \ - && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 0b6ca09a5f..0820badd9a 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -161,7 +161,8 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) #define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ template \ diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 42a6f1ebf4..2ace065808 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -767,9 +767,9 @@ KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS // ONEMKL -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) \ - && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) \ - && defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index f6d8e0806f..653ec94811 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -249,7 +249,8 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 0540bce46a..e12ee23937 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -686,7 +686,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #undef KOKKOSSPARSE_SPMV_MKL #endif -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { switch (toupper(mode_kk)) { case 'N': return oneapi::mkl::transpose::nontrans; From 36214320320daaa58005c3782dfa259a22d4fed8 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Sun, 5 Nov 2023 08:51:36 -0700 Subject: [PATCH 066/326] SYR2: fix issue with bad type in test function After comparing various function signatures and view types, the change allows tests to pass correctly and seem correct based on input params. --- blas/unit_test/Test_Blas2_syr2.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 780e9ce162..df42d2fdd4 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -163,7 +163,7 @@ class Syr2Tester { void callKkGerAndCompareKkSyr2AgainstIt( const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& org_A, - const _ViewTypeExpected& h_A_syr2, const std::string& situation); + const _HostViewTypeA& h_A_syr2, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -1574,7 +1574,7 @@ void Syr2Tester& org_A, - const _ViewTypeExpected& h_A_syr2, const std::string& situation) { + const _HostViewTypeA& h_A_syr2, const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, org_A.d_base); @@ -1685,7 +1685,7 @@ void Syr2TestercompareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.h_view); + this->compareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.d_view); } } // namespace Test From 196fa447c75ab7f6fe733faf0c23978e94094cba Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 6 Nov 2023 16:32:41 -0700 Subject: [PATCH 067/326] Update Test_Blas2_syr2.hpp Fix mistake in host/device view argument --- blas/unit_test/Test_Blas2_syr2.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index df42d2fdd4..76b2cf43c1 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -1685,7 +1685,7 @@ void Syr2TestercompareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.d_view); + this->compareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.h_view); } } // namespace Test From 24c73c8f73ec3ac0c663b61cb2130ca6908d4cbe Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 9 Nov 2023 12:57:08 -0700 Subject: [PATCH 068/326] LAPACK: adding rocsolver TPL Adding the necessary CMake logic and TPL layer to support rocsolver for LAPACK. Enabling the TPL in gesv and updating gesv test to run by default the more common configurations and only run specific ones when the associated TPL (MAGMA) is enabled. --- CMakeLists.txt | 1 + cmake/Dependencies.cmake | 2 +- cmake/KokkosKernels_config.h.in | 2 + cmake/Modules/FindTPLROCSOLVER.cmake | 9 + cmake/kokkoskernels_features.cmake | 12 + cmake/kokkoskernels_tpls.cmake | 8 +- .../src/KokkosKernels_PrintConfiguration.hpp | 7 + lapack/CMakeLists.txt | 7 - lapack/impl/KokkosLapack_gesv_spec.hpp | 44 +- lapack/src/KokkosLapack_gesv.hpp | 44 +- .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 86 ++-- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 442 ++++++++---------- lapack/unit_test/Test_Lapack_gesv.hpp | 164 ++++--- scripts/cm_test_all_sandia | 4 +- 14 files changed, 428 insertions(+), 404 deletions(-) create mode 100644 cmake/Modules/FindTPLROCSOLVER.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 812640374b..fc41d40452 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -377,6 +377,7 @@ ELSE() KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSPARSE) + KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC METIS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ARMPL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MAGMA) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 777d4445b3..104d153347 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES Kokkos - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE ROCBLAS ROCSPARSE ROCSOLVER TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index d94860e380..c40a2b18a7 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -138,6 +138,8 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCBLAS /* ROCSPARSE */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +/* ROCSOLVER */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV diff --git a/cmake/Modules/FindTPLROCSOLVER.cmake b/cmake/Modules/FindTPLROCSOLVER.cmake new file mode 100644 index 0000000000..8f2a92cfda --- /dev/null +++ b/cmake/Modules/FindTPLROCSOLVER.cmake @@ -0,0 +1,9 @@ +# LBV: 11/08/2023: This file follows the partern of FindTPLROCBLAS.cmake/FindTPLROCSPARSE.cmake +FIND_PACKAGE(ROCSOLVER) +if(TARGET roc::rocsolver) + SET(TPL_ROCSOLVER_IMPORTED_NAME roc::rocsolver) + SET(TPL_IMPORTED_NAME roc::rocsolver) + ADD_LIBRARY(KokkosKernels::ROCSOLVER ALIAS roc::rocsolver) +ELSE() + MESSAGE(FATAL_ERROR "Package ROCSOLVER requested but not found") +ENDIF() diff --git a/cmake/kokkoskernels_features.cmake b/cmake/kokkoskernels_features.cmake index aacc1c8451..3ecc95d6b5 100644 --- a/cmake/kokkoskernels_features.cmake +++ b/cmake/kokkoskernels_features.cmake @@ -27,3 +27,15 @@ IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNE INCLUDE(CheckHostBlasReturnComplex.cmake) CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) ENDIF() + +# ================================================================== +# Lapack requirements +# ================================================================== + +IF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + MESSAGE(FATAL_ERROR "rocSOLVER requires rocBLAS and rocSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE:BOOL=ON.") +ELSEIF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + MESSAGE(FATAL_ERROR "rocSOLVER requires rocSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE:BOOL=ON.") +ELSEIF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) + MESSAGE(FATAL_ERROR "rocSOLVER requires rocBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCBLAS:BOOL=ON.") +ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 08c7158148..2f54278d1b 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -460,15 +460,18 @@ KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_ROCM_TPLS OFF BOOL "Whether ROCM TPLs should # Unlike CUDA, ROCm does not automatically install these TPLs SET(ROCBLAS_DEFAULT OFF) SET(ROCSPARSE_DEFAULT OFF) +SET(ROCSOLVER_DEFAULT OFF) # Since the default is OFF we do not really need this piece of logic here. # IF(KOKKOSKERNELS_NO_DEFAULT_ROCM_TPLS) # SET(ROCBLAS_DEFAULT OFF) # SET(ROCSPARSE_DEFAULT OFF) # ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(ROCBLAS ${ROCBLAS_DEFAULT} "Whether to enable ROCBLAS" - DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") + DEFAULT_DOCSTRING "OFF even if HIP-enabled Kokkos") KOKKOSKERNELS_ADD_TPL_OPTION(ROCSPARSE ${ROCSPARSE_DEFAULT} "Whether to enable ROCSPARSE" - DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") + DEFAULT_DOCSTRING "OFF even if HIP-enabled Kokkos") +KOKKOSKERNELS_ADD_TPL_OPTION(ROCSOLVER ${ROCSOLVER_DEFAULT} "Whether to enable ROCSOLVER" + DEFAULT_DOCSTRING "OFF even if HIP-enabled Kokkos") IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) IF (F77_BLAS_MANGLE STREQUAL "(name,NAME) name ## _") @@ -507,6 +510,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(MAGMA) KOKKOSKERNELS_IMPORT_TPL(ROCBLAS) KOKKOSKERNELS_IMPORT_TPL(ROCSPARSE) + KOKKOSKERNELS_IMPORT_TPL(ROCSOLVER) ELSE () IF (Trilinos_ENABLE_SuperLU5_API) SET(HAVE_KOKKOSKERNELS_SUPERLU5_API TRUE) diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index cd2333b3ec..55e7285ed2 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -110,6 +110,13 @@ inline void print_enabled_tpls(std::ostream& os) { os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE: no\n"; #endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCOLVER: no\n"; +#endif #ifdef KOKKOSKERNELS_ENABLE_TPL_METIS os << "KOKKOSKERNELS_ENABLE_TPL_METIS: yes\n"; #else diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 8ab784a325..ee91079378 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -34,13 +34,6 @@ IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) ) ENDIF() -# Include rocm lapack TPL source file -IF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER) - LIST(APPEND SOURCES - lapack/tpls/KokkosLapack_Rocm_tpl.cpp - ) -ENDIF() - ################## # # # ETI generation # diff --git a/lapack/impl/KokkosLapack_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp index b9f8549311..57098f75fc 100644 --- a/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gesv_eti_spec_avail { enum : bool { value = false }; }; @@ -46,12 +46,16 @@ struct gesv_eti_spec_avail { EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template <> \ struct gesv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits > > { \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ enum : bool { value = true }; \ }; @@ -65,20 +69,24 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosLapack::gesv. -template ::value, - bool eti_spec_avail = gesv_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + gesv_eti_spec_avail::value> struct GESV { - static void gesv(const AMatrix &A, const BXMV &B, const IPIVV &IPIV); + static void gesv(const ExecutionSpace &space, const AMatrix &A, const BXMV &B, + const IPIVV &IPIV); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of gesv for multi vectors. // Unification layer -template -struct GESV { - static void gesv(const AMatrix & /* A */, const BXMV & /* B */, - const IPIVV & /* IPIV */) { +template +struct GESV { + static void gesv(const ExecutionSpace & /* space */, const AMatrix & /* A */, + const BXMV & /* B */, const IPIVV & /* IPIV */) { // NOTE: Might add the implementation of KokkosLapack::gesv later throw std::runtime_error( "No fallback implementation of GESV (general LU factorization & solve) " @@ -100,31 +108,33 @@ struct GESV { #define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ extern template struct GESV< \ + EXEC_SPACE_TYPE, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ false, true>; #define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GESV< \ + EXEC_SPACE_TYPE, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp index 4c9058f8ab..74d2e01cf9 100644 --- a/lapack/src/KokkosLapack_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -34,28 +34,40 @@ namespace KokkosLapack { /// \brief Solve the dense linear equation system A*X = B. /// +/// \tparam ExecutionSpace the space where the kernel will run. /// \tparam AMatrix Input matrix/Output LU, as a 2-D Kokkos::View. /// \tparam BXMV Input (right-hand side)/Output (solution) (multi)vector, as a -/// 1-D or 2-D Kokkos::View. \tparam IPIVV Output pivot indices, as a 1-D -/// Kokkos::View +/// 1-D or 2-D Kokkos::View. +/// \tparam IPIVV Output pivot indices, as a 1-D Kokkos::View /// /// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the /// factors L and U from /// the factorization A = P*L*U; the unit diagonal elements of L are not /// stored. /// \param B [in,out] On entry, the right hand side (multi)vector B. On exit, -/// the solution (multi)vector X. \param IPIV [out] On exit, the pivot indices -/// (for partial pivoting). If the View extents are zero and -/// its data pointer is NULL, pivoting is not used. +/// the solution (multi)vector X. +/// \param IPIV [out] On exit, the pivot indices (for partial pivoting). +/// If the View extents are zero and its data pointer is NULL, pivoting is not +/// used. /// -template -void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { - // NOTE: Currently, KokkosLapack::gesv only supports for MAGMA TPL and LAPACK - // TPL. - // MAGMA TPL should be enabled to call the MAGMA GPU interface for - // device views LAPACK TPL should be enabled to call the LAPACK - // interface for host views +template +void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, + const IPIVV& IPIV) { + // NOTE: Currently, KokkosLapack::gesv only supports LAPACK, MAGMA and + // rocSOLVER TPLs. + // MAGMA/rocSOLVER TPL should be enabled to call the MAGMA/rocSOLVER GPU + // interface for device views LAPACK TPL should be enabled to call the + // LAPACK interface for host views + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: A must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -137,12 +149,12 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { if (BXMV::rank == 1) { auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); - KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); } else { // BXMV::rank == 2 auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); - KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); } } diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index a3d8bb6ee9..fc8f634078 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gesv_tpl_spec_avail { enum : bool { value = false }; }; @@ -31,9 +31,12 @@ struct gesv_tpl_spec_avail { #define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -46,37 +49,22 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -/* -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( double, Kokkos::LayoutRight, -Kokkos::HostSpace) #endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( float, Kokkos::LayoutRight, -Kokkos::HostSpace) #endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, -Kokkos::LayoutRight, Kokkos::HostSpace) #endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, -Kokkos::LayoutRight, Kokkos::HostSpace) #endif -*/ #endif // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct gesv_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View > > { \ + enum : bool { value = true }; \ }; KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, @@ -88,24 +76,32 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -/* -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, -Kokkos::CudaSpace) #endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, -Kokkos::CudaSpace) #endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( -Kokkos::complex,Kokkos::LayoutRight, Kokkos::CudaSpace) #endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, -Kokkos::LayoutRight, Kokkos::CudaSpace) #endif -*/ +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER + +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) + #endif } // namespace Impl diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 2baa76a132..957ac7c138 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -45,229 +45,84 @@ inline void gesv_print_specialization() { namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_DGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,double]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), \ - B.data(), LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +template +void lapackGesvWrapper(const AViewType& A, const BViewType& B, + const IPIVViewType& IPIV) { + using Scalar = typename AViewType::non_const_value_type; -#define KOKKOSLAPACK_SGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,float]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; + const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); -#define KOKKOSLAPACK_ZGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack >::gesv( \ - N, NRHS, reinterpret_cast*>(A.data()), LDA, \ - IPIV.data(), reinterpret_cast*>(B.data()), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; + const int N = static_cast(A.extent(1)); + const int AST = static_cast(A.stride(1)); + const int LDA = (AST == 0) ? 1 : AST; + const int BST = static_cast(B.stride(1)); + const int LDB = (BST == 0) ? 1 : BST; + const int NRHS = static_cast(B.extent(1)); -#define KOKKOSLAPACK_CGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack >::gesv( \ - N, NRHS, reinterpret_cast*>(A.data()), LDA, \ - IPIV.data(), reinterpret_cast*>(B.data()), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ + int info = 0; + + if (with_pivot) { + if constexpr (Kokkos::ArithTraits::is_complex) { + using MagType = typename Kokkos::ArithTraits::mag_type; + + HostLapack>::gesv( + N, NRHS, reinterpret_cast*>(A.data()), LDA, + IPIV.data(), reinterpret_cast*>(B.data()), LDB, + info); + } else { + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), + LDB, info); + } + } +} + +#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GESV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using AViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View>; \ + \ + static void gesv(const ExecSpace& /* space */, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR \ + "]"); \ + gesv_print_specialization(); \ + lapackGesvWrapper(A, B, IPIV); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, true) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, false) -KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, true) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosLapack @@ -284,27 +139,27 @@ namespace Impl { template \ struct GESV< \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ true, ETI_SPEC_AVAIL> { \ typedef double SCALAR; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ magma_int_t*, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ PViewType; \ \ static void gesv(const AViewType& A, const BViewType& B, \ @@ -343,27 +198,27 @@ namespace Impl { template \ struct GESV< \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ true, ETI_SPEC_AVAIL> { \ typedef float SCALAR; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ magma_int_t*, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ PViewType; \ \ static void gesv(const AViewType& A, const BViewType& B, \ @@ -402,28 +257,28 @@ namespace Impl { template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View**, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ magma_int_t*, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ PViewType; \ \ static void gesv(const AViewType& A, const BViewType& B, \ @@ -463,28 +318,28 @@ namespace Impl { template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View**, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ magma_int_t*, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits> \ PViewType; \ \ static void gesv(const AViewType& A, const BViewType& B, \ @@ -536,4 +391,113 @@ KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA +// ROCSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#include +#include + +namespace KokkosLapack { +namespace Impl { + +template +void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, + const AViewType& A, const BViewType& B) { + using Scalar = typename BViewType::non_const_value_type; + using ALayout_t = typename AViewType::array_layout; + using BLayout_t = typename BViewType::array_layout; + + const rocblas_int N = static_cast(A.extent(0)); + const rocblas_int nrhs = static_cast(B.extent(1)); + const rocblas_int lda = std::is_same_v + ? A.stride(0) + : A.stride(1); + const rocblas_int ldb = std::is_same_v + ? B.stride(0) + : B.stride(1); + Kokkos::View info("rocsolver info"); + + KokkosBlas::Impl::RocBlasSingleton& s = + KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_set_stream(s.handle, space.hip_stream())); + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_sgesv(s.handle, N, nrhs, A.data(), lda, + reinterpret_cast(IPIV.data()), B.data(), + ldb, info.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_dgesv(s.handle, N, nrhs, A.data(), lda, + reinterpret_cast(IPIV.data()), B.data(), + ldb, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv( + s.handle, N, nrhs, reinterpret_cast(A.data()), + lda, reinterpret_cast(IPIV.data()), + reinterpret_cast(B.data()), ldb, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesv( + s.handle, N, nrhs, reinterpret_cast(A.data()), + lda, reinterpret_cast(IPIV.data()), + reinterpret_cast(B.data()), ldb, info.data())); + } + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GESV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using AViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const ExecSpace& space, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::gesv[TPL_ROCSOLVER," #SCALAR "]"); \ + gesv_print_specialization(); \ + \ + rocsolverGesvWrapper(space, IPIV, A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace, true) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace, false) + +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace, true) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace, false) + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER + #endif diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 06f51b7eb0..e1cf743f91 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -16,11 +16,13 @@ // only enable this test where KokkosLapack supports gesv: // CUDA+MAGMA and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || \ - defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || \ + defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include @@ -34,11 +36,13 @@ namespace Test { -template +template void impl_test_gesv(const char* mode, const char* padding, int N) { - typedef typename Device::execution_space execution_space; - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits ats; + using execution_space = typename Device::execution_space; + using ScalarA = typename ViewTypeA::value_type; + using ats = Kokkos::ArithTraits; + + execution_space space{}; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -80,7 +84,9 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - typedef Kokkos::View ViewTypeP; + using ViewTypeP = typename std::conditional< + MAGMA, Kokkos::View, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -90,7 +96,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Solve. try { - KokkosLapack::gesv(A, B, ipiv); + KokkosLapack::gesv(space, A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) @@ -138,12 +144,14 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { ASSERT_EQ(test_flag, true); } -template +template void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) { - typedef typename Device::execution_space execution_space; - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits ats; + using execution_space = typename Device::execution_space; + using ScalarA = typename ViewTypeA::value_type; + using ats = Kokkos::ArithTraits; + + execution_space space{}; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -185,7 +193,9 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - typedef Kokkos::View ViewTypeP; + using ViewTypeP = typename std::conditional< + MAGMA, Kokkos::View, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -195,7 +205,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, // Solve. try { - KokkosLapack::gesv(A, B, ipiv); + KokkosLapack::gesv(space, A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) @@ -253,41 +263,44 @@ int test_gesv(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ll; - typedef Kokkos::View view_type_b_ll; - Test::impl_test_gesv( + using view_type_a_ll = Kokkos::View; + using view_type_b_ll = Kokkos::View; + + Test::impl_test_gesv( &mode[0], "N", 2); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 13); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 179); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 64); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 1024); // no padding - Test::impl_test_gesv(&mode[0], "Y", - 13); // padding - Test::impl_test_gesv(&mode[0], "Y", - 179); // padding + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + Test::impl_test_gesv( + &mode[0], "N", 2); // no padding + Test::impl_test_gesv( + &mode[0], "N", 13); // no padding + Test::impl_test_gesv( + &mode[0], "N", 179); // no padding + Test::impl_test_gesv( + &mode[0], "N", 64); // no padding + Test::impl_test_gesv( + &mode[0], "N", 1024); // no padding + + Test::impl_test_gesv( + &mode[0], "Y", + 13); // padding + Test::impl_test_gesv( + &mode[0], "Y", + 179); // padding + } +#endif #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || - (!defined(KOKKOSKERNELS_ETI_ONLY) && - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; - Test::impl_test_gesv(&mode[0], "N", - 2); //no padding Test::impl_test_gesv(&mode[0], "N", 13); //no padding Test::impl_test_gesv(&mode[0], "N", 179); //no padding - Test::impl_test_gesv(&mode[0], "N", - 64); //no padding Test::impl_test_gesv(&mode[0], "N", 1024);//no padding Test::impl_test_gesv(&mode[0], "Y", 13); //padding - Test::impl_test_gesv(&mode[0], "Y", - 179); //padding #endif - */ // Supress unused parameters on CUDA10 (void)mode; return 1; @@ -298,42 +311,43 @@ int test_gesv_mrhs(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ll; - typedef Kokkos::View view_type_b_ll; - Test::impl_test_gesv_mrhs( + using view_type_a_ll = Kokkos::View; + using view_type_b_ll = Kokkos::View; + + Test::impl_test_gesv_mrhs( &mode[0], "N", 2, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 13, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 179, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 64, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 1024, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 13, 5); // padding - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 179, 5); // padding + +// When appropriate run MAGMA specific tests +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + Test::impl_test_gesv_mrhs( + &mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 13, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 1024, 5); // no padding + + Test::impl_test_gesv_mrhs( + &mode[0], "Y", 13, 5); // padding + Test::impl_test_gesv_mrhs( + &mode[0], "Y", 179, 5); // padding + } +#endif #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || - (!defined(KOKKOSKERNELS_ETI_ONLY) && - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; - Test::impl_test_gesv_mrhs(&mode[0], - "N", 2, 5);//no padding Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5);//no padding - Test::impl_test_gesv_mrhs(&mode[0], - "N", 179, 5);//no padding Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5);//no padding - Test::impl_test_gesv_mrhs(&mode[0], - "N", 1024,5);//no padding Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5);//padding - Test::impl_test_gesv_mrhs(&mode[0], - "Y", 179, 5);//padding #endif - */ // Supress unused parameters on CUDA10 (void)mode; return 1; @@ -411,4 +425,4 @@ TEST_F(TestCategory, gesv_mrhs_complex_float) { } #endif -#endif // CUDA+MAGMA or LAPACK+HOST +#endif // CUDA+MAGMA or HIP+ROCSOLVER or LAPACK+HOST diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 730f3c5382..3a8079dc66 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -91,7 +91,7 @@ print_help() { echo "--with-tpls=TPLS: set KOKKOSKERNELS_ENABLE_TPLS" echo " Provide a comma-separated list of TPLs" echo " Valid items:" - echo " blas, mkl, cublas, cusparse, magma, armpl, rocblas, rocsparse" + echo " blas, mkl, cublas, cusparse, magma, armpl, rocblas, rocsparse, rocsolver" echo "" echo "ARGS: list of expressions matching compilers to test" @@ -1087,7 +1087,7 @@ setup_env() { export KOKKOS_CUDA_OPTIONS="${KOKKOS_CUDA_OPTIONS},enable_lambda" fi if [[ "$compiler" == rocm* ]]; then - NEW_TPL_LIST="rocblas,rocsparse," + NEW_TPL_LIST="rocblas,rocsparse,rocsolver," fi # host tpls - use mkl with intel, else use host blas if [[ "$compiler" == intel* ]]; then From c06b8db52636128517e169e1952b8b06f0a37335 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 14 Nov 2023 20:50:02 -0700 Subject: [PATCH 069/326] Lapack: change according to Brian's review The SpaceAccessibility of IPIVV needs to be modified for MAGMA. The value_type of IPIVV needs to be rocblas_int when running with rocSOLVER. The types used for gesv_tpl_spec_avail and the actual TPL instantiation where mismatched leading to linker error. --- lapack/src/KokkosLapack_gesv.hpp | 8 ++ .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 13 ++- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 81 +++++++++---------- 3 files changed, 56 insertions(+), 46 deletions(-) diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp index 74d2e01cf9..a37cfd95fe 100644 --- a/lapack/src/KokkosLapack_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -65,9 +65,17 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, static_assert( Kokkos::SpaceAccessibility::accessible); +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + if constexpr (!std::is_same_v) { + static_assert( + Kokkos::SpaceAccessibility::accessible); + } +#else static_assert( Kokkos::SpaceAccessibility::accessible); +#endif static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: A must be a Kokkos::View."); static_assert(Kokkos::is_view::value, diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index fc8f634078..e7bc5425f7 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -75,10 +75,15 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) - #endif +} // namespace Impl +} // namespace KokkosLapack #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#include + +namespace KokkosLapack { +namespace Impl { #define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ template <> \ @@ -88,7 +93,8 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -102,9 +108,8 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) -#endif - } // namespace Impl } // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER #endif diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 957ac7c138..d3a71a0cfa 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -76,36 +76,35 @@ void lapackGesvWrapper(const AViewType& A, const BViewType& B, } } -#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using AViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = Kokkos::View>; \ - \ - static void gesv(const ExecSpace& /* space */, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR \ - "]"); \ - gesv_print_specialization(); \ - lapackGesvWrapper(A, B, IPIV); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GESV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using AViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const ExecSpace& /* space */, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR \ + "]"); \ + gesv_print_specialization(); \ + lapackGesvWrapper(A, B, IPIV); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace, true) @@ -422,28 +421,26 @@ void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocsolver_sgesv(s.handle, N, nrhs, A.data(), lda, - reinterpret_cast(IPIV.data()), B.data(), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesv(s.handle, N, nrhs, A.data(), + lda, IPIV.data(), B.data(), + ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocsolver_dgesv(s.handle, N, nrhs, A.data(), lda, - reinterpret_cast(IPIV.data()), B.data(), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesv(s.handle, N, nrhs, A.data(), + lda, IPIV.data(), B.data(), + ldb, info.data())); } if constexpr (std::is_same_v>) { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv( s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, reinterpret_cast(IPIV.data()), - reinterpret_cast(B.data()), ldb, info.data())); + lda, IPIV.data(), reinterpret_cast(B.data()), + ldb, info.data())); } if constexpr (std::is_same_v>) { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesv( s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, reinterpret_cast(IPIV.data()), - reinterpret_cast(B.data()), ldb, info.data())); + lda, IPIV.data(), reinterpret_cast(B.data()), + ldb, info.data())); } KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } From 18ca910d3f13fbb977fed98f88e84dbb5d4d776d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 15 Nov 2023 19:51:55 -0700 Subject: [PATCH 070/326] cmake/Dependencies.cmake: remove ROCSOLVER Removing ROCSOLVER to prevent configuration errors with Trilinos Will bring back when support is added in Trilinos for ROCSOLVER as TPL --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 104d153347..d5b2a1d8e9 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES Kokkos - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE ROCBLAS ROCSPARSE ROCSOLVER + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in From 4f3549d995e4883c4cb05743850236ec71be2e88 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 15 Nov 2023 14:42:19 -0700 Subject: [PATCH 071/326] Lapack: cusolver TPL logic and support for gesv Adding CMake logic to support cusolver and implementing gesv using cusolver getrf and getrs. Unit-test is passing without problems! --- CMakeLists.txt | 1 + cmake/Dependencies.cmake | 2 +- cmake/KokkosKernels_config.h.in | 6 +- cmake/Modules/FindTPLCUSOLVER.cmake | 17 ++ cmake/kokkoskernels_features.cmake | 8 + cmake/kokkoskernels_tpls.cmake | 5 + .../src/KokkosKernels_PrintConfiguration.hpp | 12 ++ common/src/KokkosKernels_TplsVersion.hpp | 15 ++ lapack/impl/KokkosLapack_gesv_spec.hpp | 2 +- lapack/tpls/KokkosLapack_Cuda_tpl.hpp | 2 +- lapack/tpls/KokkosLapack_cusolver.hpp | 92 +++++++++++ .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 31 ++++ .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 145 ++++++++++++++++++ lapack/unit_test/Test_Lapack_gesv.hpp | 30 ++-- scripts/cm_test_all_sandia | 4 +- 15 files changed, 351 insertions(+), 21 deletions(-) create mode 100644 cmake/Modules/FindTPLCUSOLVER.cmake create mode 100644 lapack/tpls/KokkosLapack_cusolver.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index fc41d40452..fb5d0591d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -375,6 +375,7 @@ ELSE() KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MKL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) + KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSOLVER) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d5b2a1d8e9..13223259ef 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES Kokkos - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE ROCBLAS ROCSPARSE + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE CUSOLVER ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index c40a2b18a7..6f5b07f287 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -114,10 +114,12 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL -/* CUSPARSE */ -#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSPARSE /* CUBLAS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUBLAS +/* CUSPARSE */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +/* CUSOLVER */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSOLVER /* MAGMA */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MAGMA /* SuperLU */ diff --git a/cmake/Modules/FindTPLCUSOLVER.cmake b/cmake/Modules/FindTPLCUSOLVER.cmake new file mode 100644 index 0000000000..4b75aefd65 --- /dev/null +++ b/cmake/Modules/FindTPLCUSOLVER.cmake @@ -0,0 +1,17 @@ +FIND_PACKAGE(CUDA) + +INCLUDE(FindPackageHandleStandardArgs) +IF (NOT CUDA_FOUND) + #Important note here: this find Module is named TPLCUSOLVER + #The eventual target is named CUSOLVER. To avoid naming conflicts + #the find module is called TPLCUSOLVER. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_FOUND) +ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_cusolver_LIBRARY}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_cusolver_LIBRARY) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSOLVER LIBRARY ${CUDA_cusolver_LIBRARY}) + ENDIF() +ENDIF() diff --git a/cmake/kokkoskernels_features.cmake b/cmake/kokkoskernels_features.cmake index 3ecc95d6b5..cbd2a848ef 100644 --- a/cmake/kokkoskernels_features.cmake +++ b/cmake/kokkoskernels_features.cmake @@ -39,3 +39,11 @@ ELSEIF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCS ELSEIF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) MESSAGE(FATAL_ERROR "rocSOLVER requires rocBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCBLAS:BOOL=ON.") ENDIF() + +IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS and cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") +ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") +ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON.") +ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 2f54278d1b..d1a44721e6 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -447,14 +447,18 @@ ENDIF() KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_CUDA_TPLS OFF BOOL "Whether CUDA TPLs should be enabled by default. Default: OFF") SET(CUBLAS_DEFAULT ${KOKKOS_ENABLE_CUDA}) SET(CUSPARSE_DEFAULT ${KOKKOS_ENABLE_CUDA}) +SET(CUSOLVER_DEFAULT ${KOKKOS_ENABLE_CUDA}) IF(KOKKOSKERNELS_NO_DEFAULT_CUDA_TPLS) SET(CUBLAS_DEFAULT OFF) SET(CUSPARSE_DEFAULT OFF) + SET(CUSOLVER_DEFAULT OFF) ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(CUBLAS ${CUBLAS_DEFAULT} "Whether to enable CUBLAS" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_TPL_OPTION(CUSPARSE ${CUSPARSE_DEFAULT} "Whether to enable CUSPARSE" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_TPL_OPTION(CUSOLVER ${CUSOLVER_DEFAULT} "Whether to enable CUSOLVER" + DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_ROCM_TPLS OFF BOOL "Whether ROCM TPLs should be enabled by default. Default: OFF") # Unlike CUDA, ROCm does not automatically install these TPLs @@ -501,6 +505,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(MKL) KOKKOSKERNELS_IMPORT_TPL(CUBLAS) KOKKOSKERNELS_IMPORT_TPL(CUSPARSE) + KOKKOSKERNELS_IMPORT_TPL(CUSOLVER) KOKKOSKERNELS_IMPORT_TPL(CBLAS) KOKKOSKERNELS_IMPORT_TPL(LAPACKE) KOKKOSKERNELS_IMPORT_TPL(CHOLMOD) diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index 55e7285ed2..b5670568e0 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -44,6 +44,18 @@ inline void print_cusparse_version_if_enabled(std::ostream& os) { << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; #endif } + +inline void print_cusolver_version_if_enabled(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: " << cusolver_version_string() + << "\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: no\n"; +#endif +} + inline void print_enabled_tpls(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK os << " " diff --git a/common/src/KokkosKernels_TplsVersion.hpp b/common/src/KokkosKernels_TplsVersion.hpp index 38de7c1399..3e00d72457 100644 --- a/common/src/KokkosKernels_TplsVersion.hpp +++ b/common/src/KokkosKernels_TplsVersion.hpp @@ -28,6 +28,10 @@ #include "cusparse.h" #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +#include "cusolver_common.h" +#endif + namespace KokkosKernels { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) @@ -53,5 +57,16 @@ inline std::string cusparse_version_string() { } #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +inline std::string cusolver_version_string() { + std::stringstream ss; + + ss << CUSOLVER_VER_MAJOR << "." << CUSOLVER_VER_MINOR << "." + << CUSOLVER_VER_PATCH << "." << CUSOLVER_VER_BUILD; + + return ss.str(); +} +#endif + } // namespace KokkosKernels #endif // _KOKKOSKERNELS_TPLS_VERSIONS_HPP diff --git a/lapack/impl/KokkosLapack_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp index 57098f75fc..97d74280ff 100644 --- a/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -90,7 +90,7 @@ struct GESV +#include "KokkosLapack_cusolver.hpp" namespace KokkosLapack { namespace Impl { diff --git a/lapack/tpls/KokkosLapack_cusolver.hpp b/lapack/tpls/KokkosLapack_cusolver.hpp new file mode 100644 index 0000000000..006fd68b6f --- /dev/null +++ b/lapack/tpls/KokkosLapack_cusolver.hpp @@ -0,0 +1,92 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_CUSOLVER_HPP_ +#define KOKKOSLAPACK_CUSOLVER_HPP_ + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include + +namespace KokkosLapack { +namespace Impl { + +// Declaration of the singleton for cusolver +// this is the only header that needs to be +// included when using cusolverDn. +struct CudaLapackSingleton { + cusolverDnHandle_t handle; + + CudaLapackSingleton(); + + static CudaLapackSingleton& singleton(); +}; + +inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, + const char* name, const char* file, + const int line) { + std::ostringstream out; + out << name << " error( "; + switch (cusolverStatus) { + case CUSOLVER_STATUS_NOT_INITIALIZED: + out << "CUSOLVER_STATUS_NOT_INITIALIZED): cusolver handle was not " + "created correctly."; + break; + case CUSOLVER_STATUS_ALLOC_FAILED: + out << "CUSOLVER_STATUS_ALLOC_FAILED): you might tried to allocate too " + "much memory"; + break; + case CUSOLVER_STATUS_INVALID_VALUE: + out << "CUSOLVER_STATUS_INVALID_VALUE)"; + break; + case CUSOLVER_STATUS_ARCH_MISMATCH: + out << "CUSOLVER_STATUS_ARCH_MISMATCH)"; + break; + case CUSOLVER_STATUS_EXECUTION_FAILED: + out << "CUSOLVER_STATUS_EXECUTION_FAILED)"; + break; + case CUSOLVER_STATUS_INTERNAL_ERROR: + out << "CUSOLVER_STATUS_INTERNAL_ERROR)"; + break; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + out << "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED)"; + break; + default: out << "unrecognized error code): this is bad!"; break; + } + if (file) { + out << " " << file << ":" << line; + } + throw std::runtime_error(out.str()); +} + +inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, + const char* name, + const char* file = nullptr, + const int line = 0) { + if (CUSOLVER_STATUS_SUCCESS != cusolverStatus) { + cusolver_internal_error_throw(cusolverStatus, name, file, line); + } +} + +// The macro below defines is the public interface for the safe cusolver calls. +// The functions themselves are protected by impl namespace. +#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, \ + __LINE__) + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#endif // KOKKOSLAPACK_CUSOLVER_HPP_ diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index e7bc5425f7..b7c336681f 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -79,6 +79,37 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, } // namespace Impl } // namespace KokkosLapack +// CUSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +namespace KokkosLapack { +namespace Impl { + +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) + +} // namespace Impl +} // namespace KokkosLapack +#endif // CUSOLVER + #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER #include diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index d3a71a0cfa..228c42f323 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -390,6 +390,151 @@ KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA +// ROCSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" + +namespace KokkosLapack { +namespace Impl { + +template +void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, + const AViewType& A, const BViewType& B) { + using memory_space = typename AViewType::memory_space; + using Scalar = typename BViewType::non_const_value_type; + using ALayout_t = typename AViewType::array_layout; + using BLayout_t = typename BViewType::array_layout; + + const int m = A.extent_int(0); + const int n = A.extent_int(1); + const int lda = std::is_same_v ? A.stride(0) + : A.stride(1); + + (void)B; + + const int nrhs = B.extent_int(1); + const int ldb = std::is_same_v ? B.stride(0) + : B.stride(1); + int lwork = 0; + Kokkos::View info("getrf info"); + + CudaLapackSingleton& s = CudaLapackSingleton::singleton(); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSetStream(s.handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf(s.handle, m, n, A.data(), + lda, Workspace.data(), + IPIV.data(), info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, + IPIV.data(), B.data(), ldb, info.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf(s.handle, m, n, A.data(), + lda, Workspace.data(), + IPIV.data(), info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, + IPIV.data(), B.data(), ldb, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf_bufferSize( + s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), + lda, reinterpret_cast(Workspace.data()), + IPIV.data(), info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs( + s.handle, CUBLAS_OP_N, m, nrhs, reinterpret_cast(A.data()), + lda, IPIV.data(), reinterpret_cast(B.data()), ldb, + info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf_bufferSize( + s.handle, m, n, reinterpret_cast(A.data()), lda, + &lwork)); + Kokkos::View Workspace("getrf workspace", + lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf( + s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs( + s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); + } + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GESV< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_CUSOLVER," #SCALAR \ + "]"); \ + gesv_print_specialization(); \ + \ + cusolverGesvWrapper(space, IPIV, A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace, true) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace, false) + +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace, true) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace, false) + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + // ROCSOLVER #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER #include diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index e1cf743f91..5796659183 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -15,15 +15,15 @@ //@HEADER // only enable this test where KokkosLapack supports gesv: -// CUDA+MAGMA and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ - (defined(TEST_HIP_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || \ - defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ - defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) +// CUDA+(MAGMA or CUSOLVER), HIP+ROCSOLVER and HOST+LAPACK +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || \ + defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ + defined(TEST_THREADS_LAPACK_CPP))) #include #include @@ -130,14 +130,16 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Checking vs ref on CPU, this eps is about 10^-9 typedef typename ats::mag_type mag_type; - const mag_type eps = 1.0e7 * ats::epsilon(); + const mag_type eps = 2.0e7 * ats::epsilon(); bool test_flag = true; for (int i = 0; i < N; i++) { if (ats::abs(h_B(i) - h_X0(i)) > eps) { test_flag = false; - // printf( " Error %d, pivot %c, padding %c: result( %.15lf ) != - // solution( %.15lf ) at (%d)\n", N, mode[0], padding[0], - // ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i) ); + printf( + " Error %d, pivot %c, padding %c: result( %.15lf ) !=" + "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", + N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), + ats::abs(h_B(i) - h_X0(i)), eps); // break; } } @@ -425,4 +427,4 @@ TEST_F(TestCategory, gesv_mrhs_complex_float) { } #endif -#endif // CUDA+MAGMA or HIP+ROCSOLVER or LAPACK+HOST +#endif // CUDA+(MAGMA or CUSOLVER) or HIP+ROCSOLVER or LAPACK+HOST diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 3a8079dc66..fda38735a0 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -91,7 +91,7 @@ print_help() { echo "--with-tpls=TPLS: set KOKKOSKERNELS_ENABLE_TPLS" echo " Provide a comma-separated list of TPLs" echo " Valid items:" - echo " blas, mkl, cublas, cusparse, magma, armpl, rocblas, rocsparse, rocsolver" + echo " blas, mkl, cublas, cusparse, cusolver, magma, armpl, rocblas, rocsparse, rocsolver" echo "" echo "ARGS: list of expressions matching compilers to test" @@ -1083,7 +1083,7 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # device tpls if [[ "$compiler" == cuda* ]]; then - NEW_TPL_LIST="cublas,cusparse," + NEW_TPL_LIST="cublas,cusparse,cusolver," export KOKKOS_CUDA_OPTIONS="${KOKKOS_CUDA_OPTIONS},enable_lambda" fi if [[ "$compiler" == rocm* ]]; then From 55433b9cca560d94e60f8d07cf06220c15d71791 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 16 Nov 2023 13:59:56 -0700 Subject: [PATCH 072/326] Lapack: updating logic in cm_generate_makefile for cusolver There is some specific TPL logic in cm_generate_makefile and it cannot be found for cusolver, changing that might to the trick! --- cm_generate_makefile.bash | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 3358ae2eb8..426827db00 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -178,6 +178,7 @@ get_kernels_tpls_list() { KOKKOSKERNELS_USER_TPL_LIBNAME_CMD= CUBLAS_DEFAULT=OFF CUSPARSE_DEFAULT=OFF + CUSOLVER_DEFAULT=OFF ROCBLAS_DEFAULT=OFF ROCSPARSE_DEFAULT=OFF PARSE_TPLS_LIST=$(echo $KOKKOSKERNELS_TPLS | tr "," "\n") @@ -191,6 +192,9 @@ get_kernels_tpls_list() { if [ "$UC_TPLS" == "CUSPARSE" ]; then CUSPARSE_DEFAULT=ON fi + if [ "$UC_TPLS" == "CUSOLVER" ]; then + CUSOLVER_DEFAULT=ON + fi if [ "$UC_TPLS" == "ROCBLAS" ]; then ROCBLAS_DEFAULT=ON fi @@ -224,6 +228,9 @@ get_kernels_tpls_list() { if [ "$CUSPARSE_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi + if [ "$CUSOLVER_DEFAULT" == "OFF" ]; then + KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF ${KOKKOSKERNELS_TPLS_CMD}" + fi if [ "$ROCBLAS_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi From af6aeca5f0d9807d4f6a6cb397e00e0e196b932c Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 19 Nov 2023 20:35:39 -0700 Subject: [PATCH 073/326] Backup --- blas/unit_test/Test_Blas2_syr2.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 76b2cf43c1..11029b8778 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -207,8 +207,8 @@ Syr2Tester::value ? 1.0e-6 : 1.0e-9), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -610,7 +610,7 @@ Syr2Tester::value ? 1 : 1.1; for (int i = 0; i < _M; ++i) { _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); From 5188b71db807f303f9aeed9952e1e4d07c426799 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 20 Nov 2023 00:45:41 -0700 Subject: [PATCH 074/326] Backup --- .../Test_Blas1_axpby_unification.hpp | 631 ++++++++++++++---- 1 file changed, 502 insertions(+), 129 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 6b2e5a3f5c..447924a5a7 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -80,9 +80,10 @@ template void impl_test_axpby_unification_compare( tA const& a, tX const& x, tB const& b, tY const& y, int N, + bool testWithNanY, typename Kokkos::ArithTraits::mag_type const max_val, typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = 0, tScalarB const inputValueB = 0) { + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { using ScalarTypeX = typename std::remove_const::type; using ScalarTypeY = @@ -101,13 +102,18 @@ void impl_test_axpby_unification_compare( { ScalarTypeY randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + if (testWithNanY) { + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); + } + else { + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + } } tY org_y("Org_Y", N); Kokkos::deep_copy(org_y.h_view, y.d_view); - tScalarA valueA(0); - tScalarB valueB(0); + tScalarA valueA(Kokkos::ArithTraits::zero()); + tScalarB valueB(Kokkos::ArithTraits::zero()); if constexpr (std::is_same_v) { valueA = a; @@ -181,10 +187,44 @@ void impl_test_axpby_unification_compare( Kokkos::deep_copy(y.h_view, y.d_view); - for (int i(0); i < N; ++i) { - EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + - valueB * org_y.h_view(i)), - y.h_view(i), 2. * max_error); + if (testWithNanY == false) { + for (int i(0); i < N; ++i) { + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + + valueB * org_y.h_view(i)), + y.h_view(i), 2. * max_error); + } + } + else { + // ******************************************************** + // Tests with 'Y == nan()' are called only for cases where + // b == Kokkos::ArithTraits::zero() + // ******************************************************** + for (int i(0); i < N; ++i) { +#if 0 + ScalarTypeY tmp = static_cast(valueA * x.h_view(i) + valueB * org_y.h_view(i)); + std::cout << "i = " << i + << ", valueA = " << valueA + << ", x.h_view(i) = " << x.h_view(i) + << ", valueB = " << valueB + << ", org_y.h_view(i) = " << org_y.h_view(i) + << ", tmp = " << tmp + << ", y.h_view(i) = " << y.h_view(i) + << std::endl; +#endif + if constexpr (std::is_same_v) { + // **************************************************************** + // 'nan()' converts to '-1' in case of 'int' => no need to compare + // **************************************************************** + if (y.h_view(i) != -1) { + EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); + } + } + else { + EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); + } + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), + y.h_view(i), 2. * max_error); + } } } @@ -192,9 +232,10 @@ template void impl_test_axpby_mv_unification_compare( tA const& a, tX const& x, tB const& b, tY const& y, int N, int K, + bool testWithNanY, typename Kokkos::ArithTraits::mag_type const max_val, typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = 0, tScalarB const inputValueB = 0) { + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { using ScalarTypeX = typename std::remove_const::type; using ScalarTypeY = @@ -213,7 +254,12 @@ void impl_test_axpby_mv_unification_compare( { ScalarTypeY randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + if (testWithNanY) { + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); + } + else { + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + } } tY org_y("Org_Y", N, K); Kokkos::deep_copy(org_y.h_view, y.d_view); @@ -232,8 +278,8 @@ void impl_test_axpby_mv_unification_compare( Kokkos::deep_copy(b.h_view, b.d_view); } - tScalarA valueA(0); - tScalarB valueB(0); + tScalarA valueA(Kokkos::ArithTraits::zero()); + tScalarB valueB(Kokkos::ArithTraits::zero()); if constexpr (std::is_same_v) { valueA = a; if constexpr (std::is_same_v) { @@ -302,36 +348,97 @@ void impl_test_axpby_mv_unification_compare( Kokkos::deep_copy(y.h_view, y.d_view); - for (int i(0); i < N; ++i) { - for (int k(0); k < K; ++k) { - ScalarTypeY vanillaValue(0.); - if constexpr (aIsRank1) { - (void)valueA; // Avoid "set but not used" error - if constexpr (bIsRank1) { - (void)valueB; // Avoid "set but not used" error - int a_k(a.h_view.extent(0) == 1 ? 0 : k); - int b_k(b.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k) + - b.h_view(b_k) * org_y.h_view(i, k)); + if (testWithNanY == false) { + for (int i(0); i < N; ++i) { + for (int k(0); k < K; ++k) { + ScalarTypeY vanillaValue(Kokkos::ArithTraits::zero()); + if constexpr (aIsRank1) { + (void)valueA; // Avoid "set but not used" error + if constexpr (bIsRank1) { + (void)valueB; // Avoid "set but not used" error + int a_k(a.h_view.extent(0) == 1 ? 0 : k); + int b_k(b.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k) + + b.h_view(b_k) * org_y.h_view(i, k)); + } else { + int a_k(a.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = static_cast( + a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + } } else { - int a_k(a.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast( - a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + if constexpr (bIsRank1) { + (void)valueB; // Avoid "set but not used" error + int b_k(b.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = static_cast( + valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + } else { + vanillaValue = static_cast(valueA * x.h_view(i, k) + + valueB * org_y.h_view(i, k)); + } } - } else { - if constexpr (bIsRank1) { - (void)valueB; // Avoid "set but not used" error - int b_k(b.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast( - valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); + } + } + } + else { + // ******************************************************** + // Tests with 'Y == nan()' are called only for cases where + // b == Kokkos::ArithTraits::zero() + // ******************************************************** + for (int i(0); i < N; ++i) { + for (int k(0); k < K; ++k) { + ScalarTypeY vanillaValue(Kokkos::ArithTraits::zero()); + if constexpr (aIsRank1) { + (void)valueA; // Avoid "set but not used" error + int a_k(a.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k)); +#if 1 + ScalarTypeY tmp = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + std::cout << "i = " << i + << ", k = " << k + << ", a_k = " << a_k + << ", a.h_view(a_k) = " << a.h_view(a_k) + << ", x.h_view(i, k) = " << x.h_view(i, k) + << ", valueB = " << valueB + << ", org_y.h_view(i, k) = " << org_y.h_view(i, k) + << ", tmp = " << tmp + << ", vanillaValue = " << vanillaValue + << ", y.h_view(i, k) = " << y.h_view(i, k) + << std::endl; +#endif } else { - vanillaValue = static_cast(valueA * x.h_view(i, k) + - valueB * org_y.h_view(i, k)); + vanillaValue = static_cast(valueA * x.h_view(i, k)); +#if 1 + ScalarTypeY tmp = static_cast(valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + std::cout << "i = " << i + << ", k = " << k + << ", valueA = " << valueA + << ", x.h_view(i, k) = " << x.h_view(i, k) + << ", valueB = " << valueB + << ", org_y.h_view(i, k) = " << org_y.h_view(i, k) + << ", tmp = " << tmp + << ", vanillaValue = " << vanillaValue + << ", y.h_view(i, k) = " << y.h_view(i, k) + << std::endl; +#endif } - } - EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); + if constexpr (std::is_same_v) { + // **************************************************************** + // 'nan()' converts to '-1' in case of 'int' => no need to compare + // **************************************************************** + if (y.h_view(i, k) != -1) { + EXPECT_NE(y.h_view(i, k), Kokkos::ArithTraits::nan()); + } + } + else { + EXPECT_NE(y.h_view(i, k), Kokkos::ArithTraits::nan()); + } + + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); + } } } } @@ -352,8 +459,8 @@ void impl_test_axpby_unification(int const N) { using ViewTypeY = Kokkos::View; - std::array const valuesA{-1, 0, 1, 3}; - std::array const valuesB{-1, 0, 1, 5}; + std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. @@ -385,7 +492,13 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, max_val, max_error); + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } } } } @@ -415,7 +528,13 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, max_val, max_error); + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } } } } @@ -440,7 +559,13 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, max_val, max_error); + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } } } } @@ -464,7 +589,13 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, max_val, max_error); + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } } } } @@ -491,7 +622,13 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, max_val, max_error); + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } } } } @@ -520,7 +657,13 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, max_val, max_error); + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } } } } @@ -548,8 +691,15 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, max_val, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); + } } } } @@ -577,7 +727,13 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, max_val, max_error); + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } } } } @@ -602,8 +758,15 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, max_val, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); + } } } } @@ -630,8 +793,15 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, max_val, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); + } } } } @@ -657,7 +827,14 @@ void impl_test_axpby_unification(int const N) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, max_val, max_error); + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } } } } @@ -682,7 +859,14 @@ void impl_test_axpby_unification(int const N) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, max_val, max_error); + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } } } } @@ -706,8 +890,15 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, max_val, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); + } } } } @@ -734,8 +925,15 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, max_val, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); + } } } } @@ -761,7 +959,14 @@ void impl_test_axpby_unification(int const N) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, max_val, max_error); + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } } } } @@ -786,7 +991,14 @@ void impl_test_axpby_unification(int const N) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, max_val, max_error); + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } } } } @@ -819,8 +1031,8 @@ void impl_test_axpby_mv_unification(int const N, int const K) { using ViewTypeY = Kokkos::View; - std::array const valuesA{-1, 0, 1, 3}; - std::array const valuesB{-1, 0, 1, 5}; + std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. @@ -836,7 +1048,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 01/36: Ascalar + Bscalar // ************************************************************ - // std::cout << "Starting case 01/36" << std::endl; + std::cout << "Starting case 01/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -852,7 +1064,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, K, max_val, max_error); + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -860,7 +1078,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 02/36: Ascalar + Br0 // ************************************************************ - // std::cout << "Starting case 02/36" << std::endl; + std::cout << "Starting case 02/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -879,7 +1097,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, K, max_val, max_error); + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -888,7 +1112,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 03/36: Ascalar + Br1s_1 // ************************************************************ - // std::cout << "Starting case 03/36" << std::endl; + std::cout << "Starting case 03/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -904,7 +1128,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -912,7 +1142,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 04/36: Ascalar + Br1s_k // ************************************************************ - // std::cout << "Starting case 04/36" << std::endl; + std::cout << "Starting case 04/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -938,7 +1168,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -946,7 +1176,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 05/36: Ascalar + Br1d,1 // ************************************************************ - // std::cout << "Starting case 05/36" << std::endl; + std::cout << "Starting case 05/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -962,7 +1192,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -970,7 +1206,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 06/36: Ascalar + Br1d,k // ************************************************************ - // std::cout << "Starting case 06/36" << std::endl; + std::cout << "Starting case 06/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -996,7 +1232,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1004,7 +1240,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 07/36: Ar0 + Bscalar // ************************************************************w - // std::cout << "Starting case 07/36" << std::endl; + std::cout << "Starting case 07/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1023,7 +1259,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, K, max_val, max_error); + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1032,7 +1274,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 08/36: Ar0 + Br0 // ************************************************************ - // std::cout << "Starting case 08/36" << std::endl; + std::cout << "Starting case 08/36" << std::endl; if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors @@ -1052,7 +1294,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, K, max_val, max_error); + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1061,7 +1309,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 09/36: Ar0 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 09/36" << std::endl; + std::cout << "Starting case 09/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1080,8 +1328,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1090,7 +1345,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 10/36: Ar0 + Br1s_k // ************************************************************ - // std::cout << "Starting case 10/36" << std::endl; + std::cout << "Starting case 10/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1119,7 +1374,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); } } @@ -1129,7 +1384,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 11/36: Ar0 + Br1d,1 // ************************************************************ - // std::cout << "Starting case 11/36" << std::endl; + std::cout << "Starting case 11/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1148,7 +1403,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1157,7 +1418,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 12/36: Ar0 + Br1d,k // ************************************************************ - // std::cout << "Starting case 12/36" << std::endl; + std::cout << "Starting case 12/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1186,7 +1447,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1195,7 +1456,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 13/36: Ar1s_1 + Bscalar // ************************************************************w - // std::cout << "Starting case 13/36" << std::endl; + std::cout << "Starting case 13/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1211,8 +1472,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1220,7 +1488,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 14/36: Ar1s_1 + Br0 // ************************************************************ - // std::cout << "Starting case 14/36" << std::endl; + std::cout << "Starting case 14/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1239,8 +1507,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1249,7 +1524,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 15/36: Ar1s_1 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 15/36" << std::endl; + std::cout << "Starting case 15/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1266,7 +1541,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1274,7 +1556,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 16/36: Ar1s_1 + Br1s_k // ************************************************************ - // std::cout << "Starting case 16/36" << std::endl; + std::cout << "Starting case 16/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1301,7 +1583,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1309,7 +1591,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 17/36: Ar1s_1 + Br1d,1 // ************************************************************ - // std::cout << "Starting case 17/36" << std::endl; + std::cout << "Starting case 17/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1326,7 +1608,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1334,7 +1623,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 18/36: Ar1s_1 + Br1d,k // ************************************************************ - // std::cout << "Starting case 18/36" << std::endl; + std::cout << "Starting case 18/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1361,7 +1650,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1369,7 +1658,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 19/36: Ar1s_k + Bscalar // ************************************************************ - // std::cout << "Starting case 19/36" << std::endl; + std::cout << "Starting case 19/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1395,8 +1684,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1404,7 +1700,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 20/36: Ar1s_k + Br0 // ************************************************************ - // std::cout << "Starting case 20/36" << std::endl; + std::cout << "Starting case 20/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1433,8 +1729,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1443,7 +1746,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 21/36: Ar1s_k + Br1s_1 // ************************************************************ - // std::cout << "Starting case 21/36" << std::endl; + std::cout << "Starting case 21/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1470,7 +1773,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1478,7 +1788,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 22/36: Ar1s_k + Br1s_k // ************************************************************ - // std::cout << "Starting case 22/36" << std::endl; + std::cout << "Starting case 22/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1516,7 +1826,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1524,7 +1834,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 23/36: Ar1s_k + Br1d,1 // ************************************************************ - // std::cout << "Starting case 23/36" << std::endl; + std::cout << "Starting case 23/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1551,7 +1861,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1559,7 +1876,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 24/36: Ar1s_k + Br1d,k // ************************************************************ - // std::cout << "Starting case 24/36" << std::endl; + std::cout << "Starting case 24/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1598,7 +1915,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1606,7 +1923,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 25/36: Ar1d,1 + Bscalar // ************************************************************w - // std::cout << "Starting case 25/36" << std::endl; + std::cout << "Starting case 25/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1622,8 +1939,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1631,7 +1955,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 26/36: Ar1d,1 + Br0 // ************************************************************ - // std::cout << "Starting case 26/36" << std::endl; + std::cout << "Starting case 26/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1650,8 +1974,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1660,7 +1991,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 27/36: Ar1d,1 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 27/36" << std::endl; + std::cout << "Starting case 27/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1677,7 +2008,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1685,7 +2023,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 28/36: Ar1d,1 + Br1s_k // ************************************************************ - // std::cout << "Starting case 28/36" << std::endl; + std::cout << "Starting case 28/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1712,7 +2050,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1720,7 +2058,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 29/36: Ar1d,1 + Br1d,1 // ************************************************************ - // std::cout << "Starting case 29/36" << std::endl; + std::cout << "Starting case 29/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1737,7 +2075,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1745,7 +2090,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 30/36: Ar1d,1 + Br1d,k // ************************************************************ - // std::cout << "Starting case 30/36" << std::endl; + std::cout << "Starting case 30/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1772,7 +2117,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1780,7 +2125,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 31/36: Ar1d,k + Bscalar // ************************************************************w - // std::cout << "Starting case 31/36" << std::endl; + std::cout << "Starting case 31/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1806,8 +2151,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1815,7 +2167,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 32/36: Ar1d,k + Br0 // ************************************************************ - // std::cout << "Starting case 32/36" << std::endl; + std::cout << "Starting case 32/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1844,8 +2196,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, max_val, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, + max_error); + } } } } @@ -1854,7 +2213,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 33/36: Ar1d,k + Br1s_1 // ************************************************************ - // std::cout << "Starting case 33/36" << std::endl; + std::cout << "Starting case 33/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1881,7 +2240,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1889,7 +2255,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 34/36: Ar1d,k + Br1s_k // ************************************************************ - // std::cout << "Starting case 34/36" << std::endl; + std::cout << "Starting case 34/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1928,7 +2294,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1936,7 +2302,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 35/36: Ar1d,k + Br1d,1 // ************************************************************ - // std::cout << "Starting case 35/36" << std::endl; + std::cout << "Starting case 35/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1963,7 +2329,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } } } } @@ -1971,7 +2344,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 36/36: Ar1d,k + Br1d,k // ************************************************************ - // std::cout << "Starting case 36/36" << std::endl; + std::cout << "Starting case 36/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2010,7 +2383,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, max_val, max_error); + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } From ee23cf735665d37e3d27e53bb55456faa80a20b8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 20 Nov 2023 01:49:21 -0700 Subject: [PATCH 075/326] Backup --- blas/impl/KokkosBlas1_axpby_impl.hpp | 28 ++++- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 110 ++++++++++++++---- .../Test_Blas1_axpby_unification.hpp | 76 ++++++------ 3 files changed, 152 insertions(+), 62 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index 29a72c19d5..dfed515368 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -123,7 +123,12 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { // Nothing to do: m_y(i) = m_y(i); } else if constexpr (scalar_y == 2) { - m_y(i) = m_b(0) * m_y(i); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + m_y(i) = Kokkos::ArithTraits::zero(); + } + else { + m_y(i) = m_b(0) * m_y(i); + } } } // ************************************************************** @@ -137,7 +142,12 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = -m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - m_y(i) = -m_x(i) + m_b(0) * m_y(i); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + m_y(i) = -m_x(i); + } + else { + m_y(i) = -m_x(i) + m_b(0) * m_y(i); + } } } // ************************************************************** @@ -151,7 +161,12 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - m_y(i) = m_x(i) + m_b(0) * m_y(i); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + m_y(i) = m_x(i); + } + else { + m_y(i) = m_x(i) + m_b(0) * m_y(i); + } } } // ************************************************************** @@ -165,7 +180,12 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_a(0) * m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + m_y(i) = m_a(0) * m_x(i); + } + else { + m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); + } } } } diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 4a9872d7d7..349a8ba8ef 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -129,8 +129,15 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = Kokkos::ArithTraits::zero(); + } + } + else { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -187,8 +194,15 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k); + } + } + else { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -245,8 +259,15 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k); + } + } + else { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -340,8 +361,15 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } + else { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -362,8 +390,15 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } + else { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -718,8 +753,15 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = Kokkos::ArithTraits::zero(); + } + } + else { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -761,8 +803,15 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k); + } + } + else { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -804,8 +853,15 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k); + } + } + else { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -875,8 +931,15 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } + else { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -891,8 +954,15 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } + else { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + } } } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 447924a5a7..ef18961645 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -394,7 +394,7 @@ void impl_test_axpby_mv_unification_compare( (void)valueA; // Avoid "set but not used" error int a_k(a.h_view.extent(0) == 1 ? 0 : k); vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k)); -#if 1 +#if 0 ScalarTypeY tmp = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); std::cout << "i = " << i << ", k = " << k @@ -410,7 +410,7 @@ void impl_test_axpby_mv_unification_compare( #endif } else { vanillaValue = static_cast(valueA * x.h_view(i, k)); -#if 1 +#if 0 ScalarTypeY tmp = static_cast(valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); std::cout << "i = " << i << ", k = " << k @@ -1048,7 +1048,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 01/36: Ascalar + Bscalar // ************************************************************ - std::cout << "Starting case 01/36" << std::endl; + // std::cout << "Starting case 01/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1078,7 +1078,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 02/36: Ascalar + Br0 // ************************************************************ - std::cout << "Starting case 02/36" << std::endl; + // std::cout << "Starting case 02/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1112,7 +1112,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 03/36: Ascalar + Br1s_1 // ************************************************************ - std::cout << "Starting case 03/36" << std::endl; + // std::cout << "Starting case 03/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1142,7 +1142,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 04/36: Ascalar + Br1s_k // ************************************************************ - std::cout << "Starting case 04/36" << std::endl; + // std::cout << "Starting case 04/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1176,7 +1176,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 05/36: Ascalar + Br1d,1 // ************************************************************ - std::cout << "Starting case 05/36" << std::endl; + // std::cout << "Starting case 05/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1206,7 +1206,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 06/36: Ascalar + Br1d,k // ************************************************************ - std::cout << "Starting case 06/36" << std::endl; + // std::cout << "Starting case 06/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1240,7 +1240,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 07/36: Ar0 + Bscalar // ************************************************************w - std::cout << "Starting case 07/36" << std::endl; + // std::cout << "Starting case 07/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1274,7 +1274,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 08/36: Ar0 + Br0 // ************************************************************ - std::cout << "Starting case 08/36" << std::endl; + // std::cout << "Starting case 08/36" << std::endl; if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors @@ -1309,7 +1309,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 09/36: Ar0 + Br1s_1 // ************************************************************ - std::cout << "Starting case 09/36" << std::endl; + // std::cout << "Starting case 09/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1345,7 +1345,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 10/36: Ar0 + Br1s_k // ************************************************************ - std::cout << "Starting case 10/36" << std::endl; + // std::cout << "Starting case 10/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1384,7 +1384,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 11/36: Ar0 + Br1d,1 // ************************************************************ - std::cout << "Starting case 11/36" << std::endl; + // std::cout << "Starting case 11/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1418,7 +1418,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 12/36: Ar0 + Br1d,k // ************************************************************ - std::cout << "Starting case 12/36" << std::endl; + // std::cout << "Starting case 12/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1456,7 +1456,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 13/36: Ar1s_1 + Bscalar // ************************************************************w - std::cout << "Starting case 13/36" << std::endl; + // std::cout << "Starting case 13/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1488,7 +1488,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 14/36: Ar1s_1 + Br0 // ************************************************************ - std::cout << "Starting case 14/36" << std::endl; + // std::cout << "Starting case 14/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1524,7 +1524,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 15/36: Ar1s_1 + Br1s_1 // ************************************************************ - std::cout << "Starting case 15/36" << std::endl; + // std::cout << "Starting case 15/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1556,7 +1556,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 16/36: Ar1s_1 + Br1s_k // ************************************************************ - std::cout << "Starting case 16/36" << std::endl; + // std::cout << "Starting case 16/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1591,7 +1591,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 17/36: Ar1s_1 + Br1d,1 // ************************************************************ - std::cout << "Starting case 17/36" << std::endl; + // std::cout << "Starting case 17/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1623,7 +1623,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 18/36: Ar1s_1 + Br1d,k // ************************************************************ - std::cout << "Starting case 18/36" << std::endl; + // std::cout << "Starting case 18/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1658,7 +1658,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 19/36: Ar1s_k + Bscalar // ************************************************************ - std::cout << "Starting case 19/36" << std::endl; + // std::cout << "Starting case 19/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1700,7 +1700,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 20/36: Ar1s_k + Br0 // ************************************************************ - std::cout << "Starting case 20/36" << std::endl; + // std::cout << "Starting case 20/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1746,7 +1746,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 21/36: Ar1s_k + Br1s_1 // ************************************************************ - std::cout << "Starting case 21/36" << std::endl; + // std::cout << "Starting case 21/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1788,7 +1788,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 22/36: Ar1s_k + Br1s_k // ************************************************************ - std::cout << "Starting case 22/36" << std::endl; + // std::cout << "Starting case 22/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1834,7 +1834,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 23/36: Ar1s_k + Br1d,1 // ************************************************************ - std::cout << "Starting case 23/36" << std::endl; + // std::cout << "Starting case 23/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1876,7 +1876,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 24/36: Ar1s_k + Br1d,k // ************************************************************ - std::cout << "Starting case 24/36" << std::endl; + // std::cout << "Starting case 24/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1923,7 +1923,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 25/36: Ar1d,1 + Bscalar // ************************************************************w - std::cout << "Starting case 25/36" << std::endl; + // std::cout << "Starting case 25/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1955,7 +1955,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 26/36: Ar1d,1 + Br0 // ************************************************************ - std::cout << "Starting case 26/36" << std::endl; + // std::cout << "Starting case 26/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1991,7 +1991,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 27/36: Ar1d,1 + Br1s_1 // ************************************************************ - std::cout << "Starting case 27/36" << std::endl; + // std::cout << "Starting case 27/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2023,7 +2023,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 28/36: Ar1d,1 + Br1s_k // ************************************************************ - std::cout << "Starting case 28/36" << std::endl; + // std::cout << "Starting case 28/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2058,7 +2058,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 29/36: Ar1d,1 + Br1d,1 // ************************************************************ - std::cout << "Starting case 29/36" << std::endl; + // std::cout << "Starting case 29/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2090,7 +2090,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 30/36: Ar1d,1 + Br1d,k // ************************************************************ - std::cout << "Starting case 30/36" << std::endl; + // std::cout << "Starting case 30/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2125,7 +2125,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 31/36: Ar1d,k + Bscalar // ************************************************************w - std::cout << "Starting case 31/36" << std::endl; + // std::cout << "Starting case 31/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2167,7 +2167,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 32/36: Ar1d,k + Br0 // ************************************************************ - std::cout << "Starting case 32/36" << std::endl; + // std::cout << "Starting case 32/36" << std::endl; if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -2213,7 +2213,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 33/36: Ar1d,k + Br1s_1 // ************************************************************ - std::cout << "Starting case 33/36" << std::endl; + // std::cout << "Starting case 33/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2255,7 +2255,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 34/36: Ar1d,k + Br1s_k // ************************************************************ - std::cout << "Starting case 34/36" << std::endl; + // std::cout << "Starting case 34/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2302,7 +2302,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 35/36: Ar1d,k + Br1d,1 // ************************************************************ - std::cout << "Starting case 35/36" << std::endl; + // std::cout << "Starting case 35/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2344,7 +2344,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 36/36: Ar1d,k + Br1d,k // ************************************************************ - std::cout << "Starting case 36/36" << std::endl; + // std::cout << "Starting case 36/36" << std::endl; for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { From 931d8b43f430de241a4f4b2dc109f150dc7d5b86 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 20 Nov 2023 02:22:37 -0700 Subject: [PATCH 076/326] Formatting --- blas/impl/KokkosBlas1_axpby_impl.hpp | 27 +-- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 66 +++--- .../Test_Blas1_axpby_unification.hpp | 211 +++++++++--------- blas/unit_test/Test_Blas2_syr2.hpp | 8 +- 4 files changed, 165 insertions(+), 147 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index dfed515368..b919d76a94 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -123,10 +123,11 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { // Nothing to do: m_y(i) = m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == Kokkos::ArithTraits::zero()) { - m_y(i) = Kokkos::ArithTraits::zero(); - } - else { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { + m_y(i) = + Kokkos::ArithTraits::zero(); + } else { m_y(i) = m_b(0) * m_y(i); } } @@ -142,10 +143,10 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = -m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { m_y(i) = -m_x(i); - } - else { + } else { m_y(i) = -m_x(i) + m_b(0) * m_y(i); } } @@ -161,10 +162,10 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { m_y(i) = m_x(i); - } - else { + } else { m_y(i) = m_x(i) + m_b(0) * m_y(i); } } @@ -180,10 +181,10 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_a(0) * m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { m_y(i) = m_a(0) * m_x(i); - } - else { + } else { m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); } } diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 349a8ba8ef..e81728d3ba 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -129,12 +129,13 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = Kokkos::ArithTraits::zero(); + m_y(i, k) = Kokkos::ArithTraits< + typename YMV::non_const_value_type>::zero(); } - } - else { + } else { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_b(0) * m_y(i, k); } @@ -194,12 +195,12 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_x(i, k); } - } - else { + } else { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); } @@ -259,12 +260,12 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_x(i, k); } - } - else { + } else { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); } @@ -361,12 +362,12 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(0) * m_x(i, k); } - } - else { + } else { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); } @@ -390,12 +391,12 @@ struct Axpby_MV_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(k) * m_x(i, k); } - } - else { + } else { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); } @@ -753,12 +754,13 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = Kokkos::ArithTraits::zero(); + m_y(i, k) = Kokkos::ArithTraits< + typename YMV::non_const_value_type>::zero(); } - } - else { + } else { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_b(0) * m_y(i, k); } @@ -803,12 +805,12 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_x(i, k); } - } - else { + } else { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); } @@ -853,12 +855,12 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_x(i, k); } - } - else { + } else { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); } @@ -931,12 +933,12 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(0) * m_x(i, k); } - } - else { + } else { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); } @@ -954,12 +956,12 @@ struct Axpby_MV_Unroll_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(k) * m_x(i, k); } - } - else { + } else { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); } diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index ef18961645..0457527718 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -83,7 +83,8 @@ void impl_test_axpby_unification_compare( bool testWithNanY, typename Kokkos::ArithTraits::mag_type const max_val, typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = Kokkos::ArithTraits::zero(), tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { using ScalarTypeX = typename std::remove_const::type; using ScalarTypeY = @@ -104,8 +105,7 @@ void impl_test_axpby_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); if (testWithNanY) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); - } - else { + } else { Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } } @@ -193,8 +193,7 @@ void impl_test_axpby_unification_compare( valueB * org_y.h_view(i)), y.h_view(i), 2. * max_error); } - } - else { + } else { // ******************************************************** // Tests with 'Y == nan()' are called only for cases where // b == Kokkos::ArithTraits::zero() @@ -218,8 +217,7 @@ void impl_test_axpby_unification_compare( if (y.h_view(i) != -1) { EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); } - } - else { + } else { EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); } EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), @@ -235,7 +233,8 @@ void impl_test_axpby_mv_unification_compare( bool testWithNanY, typename Kokkos::ArithTraits::mag_type const max_val, typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = Kokkos::ArithTraits::zero(), tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { using ScalarTypeX = typename std::remove_const::type; using ScalarTypeY = @@ -256,8 +255,7 @@ void impl_test_axpby_mv_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); if (testWithNanY) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); - } - else { + } else { Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } } @@ -373,16 +371,15 @@ void impl_test_axpby_mv_unification_compare( vanillaValue = static_cast( valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); } else { - vanillaValue = static_cast(valueA * x.h_view(i, k) + - valueB * org_y.h_view(i, k)); + vanillaValue = static_cast( + valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } } EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); } } - } - else { + } else { // ******************************************************** // Tests with 'Y == nan()' are called only for cases where // b == Kokkos::ArithTraits::zero() @@ -393,7 +390,8 @@ void impl_test_axpby_mv_unification_compare( if constexpr (aIsRank1) { (void)valueA; // Avoid "set but not used" error int a_k(a.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k)); + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k)); #if 0 ScalarTypeY tmp = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); std::cout << "i = " << i @@ -432,11 +430,10 @@ void impl_test_axpby_mv_unification_compare( if (y.h_view(i, k) != -1) { EXPECT_NE(y.h_view(i, k), Kokkos::ArithTraits::nan()); } - } - else { + } else { EXPECT_NE(y.h_view(i, k), Kokkos::ArithTraits::nan()); } - + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); } } @@ -459,8 +456,10 @@ void impl_test_axpby_unification(int const N) { using ViewTypeY = Kokkos::View; - std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; - std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; + std::array const valuesA{ + -1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{ + -1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. @@ -563,8 +562,9 @@ void impl_test_axpby_unification(int const N) { if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -691,14 +691,14 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -731,8 +731,9 @@ void impl_test_axpby_unification(int const N) { if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -758,14 +759,14 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -793,14 +794,14 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -832,8 +833,9 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -890,14 +892,14 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -925,14 +927,14 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -964,8 +966,9 @@ void impl_test_axpby_unification(int const N) { impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -1031,8 +1034,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { using ViewTypeY = Kokkos::View; - std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; - std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; + std::array const valuesA{ + -1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{ + -1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. @@ -1132,8 +1137,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1328,14 +1334,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1374,8 +1380,8 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -1407,8 +1413,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1472,14 +1479,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1507,14 +1514,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1546,8 +1553,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1684,14 +1692,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1729,14 +1737,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1778,8 +1786,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1939,14 +1948,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1974,14 +1983,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2013,8 +2022,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2151,14 +2161,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2196,14 +2206,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, max_val, - max_error); + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2245,8 +2255,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 11029b8778..a3b53129fe 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -207,8 +207,12 @@ Syr2Tester::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value + ? 1.0e-6 + : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value + ? 5.0e-3 + : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), From 1624ffd2f698f5eb3ce435f80a9dde0a5878d6cb Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 20 Nov 2023 18:09:16 -0700 Subject: [PATCH 077/326] mv_unification tests with double are failing by very small amounts, e.g. 5.9e-14 vs. 3.6e-14 --- blas/unit_test/Test_Blas1_axpby_unification.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 0457527718..73ad9653c7 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -1043,7 +1043,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // in which the result is computed. using MagnitudeB = typename Kokkos::ArithTraits::mag_type; MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); - MagnitudeB const max_val = 10; + MagnitudeB const max_val = 20; MagnitudeB const max_error = static_cast( Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + From af49d606f23bbdc2f56a5361002be0ca0703e447 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 21 Nov 2023 02:03:34 -0700 Subject: [PATCH 078/326] Trying one more increment on tolerance --- blas/unit_test/Test_Blas1_axpby_unification.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 73ad9653c7..c75a4138cd 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -1043,7 +1043,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // in which the result is computed. using MagnitudeB = typename Kokkos::ArithTraits::mag_type; MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); - MagnitudeB const max_val = 20; + MagnitudeB const max_val = 40; MagnitudeB const max_error = static_cast( Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + From 091b3ab0899b08a77d1de257c0cb43ab9c14cebd Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 21 Nov 2023 08:46:49 -0700 Subject: [PATCH 079/326] Putting pragma's and unrolls properly right before for loops (compilation warning at weaver) --- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 85 ++++++++++++++----- .../Test_Blas1_axpby_unification.hpp | 2 +- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index e81728d3ba..7db7b0abe3 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -123,19 +123,25 @@ struct Axpby_MV_Functor { // Nothing to do: Y(i,j) := Y(i,j) } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == - Kokkos::ArithTraits::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = Kokkos::ArithTraits< typename YMV::non_const_value_type>::zero(); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_b(0) * m_y(i, k); } @@ -189,18 +195,24 @@ struct Axpby_MV_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == - Kokkos::ArithTraits::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); } @@ -254,18 +266,24 @@ struct Axpby_MV_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == - Kokkos::ArithTraits::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); } @@ -356,18 +374,24 @@ struct Axpby_MV_Functor { } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(0) * m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); } @@ -385,18 +409,24 @@ struct Axpby_MV_Functor { } } else { if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(k) * m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif for (size_type k = 0; k < numCols; ++k) { m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); } @@ -751,16 +781,19 @@ struct Axpby_MV_Unroll_Functor { // Nothing to do: Y(i,j) := Y(i,j) } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == - Kokkos::ArithTraits::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = Kokkos::ArithTraits< typename YMV::non_const_value_type>::zero(); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_b(0) * m_y(i, k); } @@ -802,15 +835,18 @@ struct Axpby_MV_Unroll_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == - Kokkos::ArithTraits::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); } @@ -852,15 +888,18 @@ struct Axpby_MV_Unroll_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == - Kokkos::ArithTraits::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); } @@ -930,15 +969,18 @@ struct Axpby_MV_Unroll_Functor { } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(0) * m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); } @@ -953,15 +995,18 @@ struct Axpby_MV_Unroll_Functor { } } else { if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(k) * m_x(i, k); } } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif for (int k = 0; k < UNROLL; ++k) { m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); } diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index c75a4138cd..0457527718 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -1043,7 +1043,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // in which the result is computed. using MagnitudeB = typename Kokkos::ArithTraits::mag_type; MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); - MagnitudeB const max_val = 40; + MagnitudeB const max_val = 10; MagnitudeB const max_error = static_cast( Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + From fc3d24a3a4056d825bc95d37df1f8c725b8a30f9 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 21 Nov 2023 11:22:41 -0700 Subject: [PATCH 080/326] Giving it another try to larger tolarance, after fixing the warning on pragma and unroll --- blas/unit_test/Test_Blas1_axpby_unification.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 0457527718..c75a4138cd 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -1043,7 +1043,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // in which the result is computed. using MagnitudeB = typename Kokkos::ArithTraits::mag_type; MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); - MagnitudeB const max_val = 10; + MagnitudeB const max_val = 40; MagnitudeB const max_error = static_cast( Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + From aed6a460de51f0cfd364815ed1f86f5d499a5b9c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 20 Nov 2023 17:36:24 -0700 Subject: [PATCH 081/326] Lapack: gesv, implementing review commments --- ...KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp | 2 +- .../src/KokkosKernels_PrintConfiguration.hpp | 1 + .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 152 +++++++++++------- lapack/unit_test/Test_Lapack_gesv.hpp | 10 +- 4 files changed, 97 insertions(+), 68 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp index e6dfef7c6d..869c065af2 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -163,7 +163,7 @@ namespace Impl { YViewType; \ typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ + Kokkos::MemoryTraits > \ AViewType; \ \ static void syr2(const typename AViewType::execution_space& space, \ diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index b5670568e0..c2e3a5187f 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -108,6 +108,7 @@ inline void print_enabled_tpls(std::ostream& os) { #endif print_cublas_version_if_enabled(os); print_cusparse_version_if_enabled(os); + print_cusolver_version_if_enabled(os); #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 228c42f323..82f7aea64a 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -76,28 +76,37 @@ void lapackGesvWrapper(const AViewType& A, const BViewType& B, } } -#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ +#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, EXECSPACE, MEM_SPACE) \ + template <> \ struct GESV< \ - ExecSpace, \ - Kokkos::View, \ + EXECSPACE, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ + true, \ + gesv_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ using AViewType = \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>; \ using BViewType = \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>; \ using PViewType = \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>; \ \ - static void gesv(const ExecSpace& /* space */, const AViewType& A, \ + static void gesv(const EXECSPACE& /* space */, const AViewType& A, \ const BViewType& B, const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR \ "]"); \ @@ -107,21 +116,27 @@ void lapackGesvWrapper(const AViewType& A, const BViewType& B, } \ }; -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace, false) - -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace, false) - +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) - + Kokkos::Serial, Kokkos::HostSpace) KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) + Kokkos::Serial, Kokkos::HostSpace) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) + Kokkos::OpenMP, Kokkos::HostSpace) +#endif } // namespace Impl } // namespace KokkosLapack @@ -390,7 +405,7 @@ KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA -// ROCSOLVER +// CUSOLVER #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER #include "KokkosLapack_cusolver.hpp" @@ -483,7 +498,7 @@ void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); } -#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ template <> \ struct GESV< \ Kokkos::Cuda, \ @@ -493,7 +508,17 @@ void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ + true, \ + gesv_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ using AViewType = Kokkos::View, \ Kokkos::MemoryTraits>; \ @@ -515,21 +540,21 @@ void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, } \ }; -KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - +KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - + Kokkos::CudaSpace) KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) + Kokkos::CudaSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) + Kokkos::CudaUVMSpace) +#endif } // namespace Impl } // namespace KokkosLapack @@ -590,28 +615,40 @@ void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ +#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ struct GESV< \ - ExecSpace, \ - Kokkos::View, \ + Kokkos::HIP, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ + true, \ + gesv_eti_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ using AViewType = \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>; \ using BViewType = \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>; \ using PViewType = Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>; \ \ - static void gesv(const ExecSpace& space, const AViewType& A, \ + static void gesv(const Kokkos::HIP& space, const AViewType& A, \ const BViewType& B, const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ "KokkosLapack::gesv[TPL_ROCSOLVER," #SCALAR "]"); \ @@ -622,21 +659,12 @@ void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, } \ }; -KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) - -KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) - -KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) +KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) + Kokkos::HIPSpace) KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) + Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 5796659183..318f9f06ae 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -135,11 +135,11 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { for (int i = 0; i < N; i++) { if (ats::abs(h_B(i) - h_X0(i)) > eps) { test_flag = false; - printf( - " Error %d, pivot %c, padding %c: result( %.15lf ) !=" - "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", - N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), - ats::abs(h_B(i) - h_X0(i)), eps); + // printf( + // " Error %d, pivot %c, padding %c: result( %.15lf ) !=" + // "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", + // N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), + // int(i), ats::abs(h_B(i) - h_X0(i)), eps); // break; } } From d232d2b8653686462b52be5977aa90497d78d5b4 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 8 Nov 2023 14:26:26 -0700 Subject: [PATCH 082/326] Adding Changelog for Release 4.2.0 (#2031) * Adding Changelog for Release 4.2.0 Part of Kokkos C++ Performance Portability Programming EcoSystem 4.2 * Formatting the changelog a bit more Mentioning more clearly LAPACK vs BLAS, grouping PRs by logical work unit, etc... * Remove minor revisions, improve text descriptions * Changelog: add spmv perftest detail --------- Co-authored-by: Luc Berger Co-authored-by: Carl Pearson Co-authored-by: brian-kelley --- CHANGELOG.md | 261 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d582fc354f..59c3f5a647 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,266 @@ # Change Log +## [4.2.00](https://github.com/kokkos/kokkos-kernels/tree/4.2.00) (2023-11-06) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.1.00...4.2.00) + +### New Features + +#### BLAS updates +- Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) + +#### LAPACK +- New component added for the implementation of LAPACK algorithms and to support associated TPLs [\#1985](https://github.com/kokkos/kokkos-kernels/pull/1985) +- Fix some issue with unit-test definition for SYCL backend in the new LAPACK component [\#2024](https://github.com/kokkos/kokkos-kernels/pull/2024) + +#### Sparse updates +- Extract diagonal blocks from a CRS matrix into separate CRS matrices [\#1947](https://github.com/kokkos/kokkos-kernels/pull/1947) +- Adding exec space instance to spmv [\#1932](https://github.com/kokkos/kokkos-kernels/pull/1932) +- Add merge-based SpMV [\#1911](https://github.com/kokkos/kokkos-kernels/pull/1911) +- Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (PSGS and Team_PSGS) [\#1906](https://github.com/kokkos/kokkos-kernels/pull/1906) +- Add a MergeMatrixDiagonal abstraction to KokkosSparse [\#1780](https://github.com/kokkos/kokkos-kernels/pull/1780) + +#### ODE updates +- Newton solver [\#1924](https://github.com/kokkos/kokkos-kernels/pull/1924) + +### Enhancements: + +#### Sparse +- MDF performance improvements exposing more parallelism in the implementation + - MDF: convert remaining count functor to hierarchical parallelism [\#1894](https://github.com/kokkos/kokkos-kernels/pull/1894) + - MDF: move most expensive kernels over to hierarchical parallelism [\#1893](https://github.com/kokkos/kokkos-kernels/pull/1893) +- Improvements to the Block Crs Matrix-Vector multiplication algorithm + - Improve BSR matrix SpMV Performance [\#1740](https://github.com/kokkos/kokkos-kernels/pull/1740) + - Disallow BsrMatrix tensor-core SpMV on non-scalar types [\#1937](https://github.com/kokkos/kokkos-kernels/pull/1937) + - remove triplicate sanity checks in BsrMatrix [\#1923](https://github.com/kokkos/kokkos-kernels/pull/1923) + - remove duplicate BSR SpMV tests [\#1922](https://github.com/kokkos/kokkos-kernels/pull/1922) +- Only deep_copy from device to host if supernodal sptrsv algorithms are used [\#1993](https://github.com/kokkos/kokkos-kernels/pull/1993) +- Improve KokkosSparse_kk_spmv [\#1979](https://github.com/kokkos/kokkos-kernels/pull/1979) + - Add 5 warm-up calls to get accurate, consistent timing + - Print out the matrix dimensions correctly when loading from disk +- sparse/impl: Make PSGS non-blocking [\#1917](https://github.com/kokkos/kokkos-kernels/pull/1917) + +#### ODE +- ODE: changing layout of temp mem in RK algorithms [\#1908](https://github.com/kokkos/kokkos-kernels/pull/1908) +- ODE: adding adaptivity test for RK methods [\#1896](https://github.com/kokkos/kokkos-kernels/pull/1896) + +#### Common utilities +- Common: remove half and bhalf implementations (now in Kokkos Core) [\#1981](https://github.com/kokkos/kokkos-kernels/pull/1981) +- KokkosKernels: switching from printf macro to function [\#1977](https://github.com/kokkos/kokkos-kernels/pull/1977) +- OrdinalTraits: constexpr functions [\#1976](https://github.com/kokkos/kokkos-kernels/pull/1976) +- Parallel prefix sum can infer view type [\#1974](https://github.com/kokkos/kokkos-kernels/pull/1974) + +#### TPL support +- BSPGEMM: removing cusparse testing for version older than 11.4.0 [\#1996](https://github.com/kokkos/kokkos-kernels/pull/1996) +- Revise KokkosBlas::nrm2 TPL implementation [\#1950](https://github.com/kokkos/kokkos-kernels/pull/1950) +- Add TPL oneMKL GEMV support [\#1912](https://github.com/kokkos/kokkos-kernels/pull/1912) +- oneMKL spmv [\#1882](https://github.com/kokkos/kokkos-kernels/pull/1882) + +### Build System: +- CMakeLists.txt: Update Kokkos version to 4.2.99 for version check [\#2003](https://github.com/kokkos/kokkos-kernels/pull/2003) +- CMake: Adding logic to catch bad Kokkos version [\#1990](https://github.com/kokkos/kokkos-kernels/pull/1990) +- Remove calling tribits_exclude_autotools_files() [\#1888](https://github.com/kokkos/kokkos-kernels/pull/1888) + +### Documentation and Testing: +- Update create_gs_handle docs [\#1958](https://github.com/kokkos/kokkos-kernels/pull/1958) +- docs: Add testing table [\#1876](https://github.com/kokkos/kokkos-kernels/pull/1876) +- docs: Note which builds have ETI disabled [\#1934](https://github.com/kokkos/kokkos-kernels/pull/1934) +- Generate HTML docs [\#1921](https://github.com/kokkos/kokkos-kernels/pull/1921) +- github/workflows: Pin sphinx version [\#1948](https://github.com/kokkos/kokkos-kernels/pull/1948) +- github/workflows/docs.yml: Use up-to-date doxygen version [\#1941](https://github.com/kokkos/kokkos-kernels/pull/1941) + +- Unit-Test: adding specific test for block sparse functions [\#1944](https://github.com/kokkos/kokkos-kernels/pull/1944) +- Update SYCL docker image to Cuda 11.7.1 [\#1939](https://github.com/kokkos/kokkos-kernels/pull/1939) +- Remove printouts from the unit tests of ger() and syr() [\#1933](https://github.com/kokkos/kokkos-kernels/pull/1933) +- update testing scripts [\#1960](https://github.com/kokkos/kokkos-kernels/pull/1960) +- Speed up BSR spmv tests [\#1945](https://github.com/kokkos/kokkos-kernels/pull/1945) +- Test_ODE_Newton: Add template parameters for Kokkos::pair [\#1929](https://github.com/kokkos/kokkos-kernels/pull/1929) +- par_ilut: Update documentation for fill_in_limit [\#2001](https://github.com/kokkos/kokkos-kernels/pull/2001) + +### Benchmarks: +- perf_test/sparse: Update GS perf_test for streams [\#1963](https://github.com/kokkos/kokkos-kernels/pull/1963) +- Batched sparse perf_tests: Don't write to source tree during build [\#1904](https://github.com/kokkos/kokkos-kernels/pull/1904) +- ParILUT bench: fix unused IS_GPU warning [\#1900](https://github.com/kokkos/kokkos-kernels/pull/1900) +- BsrMatrix SpMV Google Benchmark [\#1886](https://github.com/kokkos/kokkos-kernels/pull/1886) +- Use extraction timestamps for fetched Google Benchmark files [\#1881](https://github.com/kokkos/kokkos-kernels/pull/1881) +- Improve help text in perf tests [\#1875](https://github.com/kokkos/kokkos-kernels/pull/1875) + +### Cleanup: +- iostream clean-up in benchmarks [\#2004](https://github.com/kokkos/kokkos-kernels/pull/2004) +- Rename TestExecSpace to TestDevice [\#1970](https://github.com/kokkos/kokkos-kernels/pull/1970) +- remove Intel 2017 code (no longer supported) [\#1920](https://github.com/kokkos/kokkos-kernels/pull/1920) +- clean-up implementations for move of HIP outside of experimental [#1999](https://github.com/kokkos/kokkos-kernels/pull/1999) + +### Bug Fixes: +- upstream iostream removal fix [\#1991](https://github.com/kokkos/kokkos-kernels/pull/1991), [\#1995](https://github.com/kokkos/kokkos-kernels/pull/1995) +- Test and fix gemv stream interface [\#1987](https://github.com/kokkos/kokkos-kernels/pull/1987) +- Test_Sparse_spmv_bsr.hpp: Workaround cuda 11.2 compiler error [\#1983](https://github.com/kokkos/kokkos-kernels/pull/1983) +- Fix improper use of execution space instances in ODE tests. Better handling of CudaUVMSpaces during build. [\#1973](https://github.com/kokkos/kokkos-kernels/pull/1973) +- Don't assume the default memory space is used [\#1969](https://github.com/kokkos/kokkos-kernels/pull/1969) +- MDF: set default verbosity explicitly to avoid valgrind warnings [\#1968](https://github.com/kokkos/kokkos-kernels/pull/1968) +- Fix sort_and_merge functions for in-place case [\#1966](https://github.com/kokkos/kokkos-kernels/pull/1966) +- SPMV_Struct_Functor: initialize numExterior to 0 [\#1957](https://github.com/kokkos/kokkos-kernels/pull/1957) +- Use rank-1 impl types when rank-2 vector is dynamically rank 1 [\#1953](https://github.com/kokkos/kokkos-kernels/pull/1953) +- BsrMatrix: Check if CUDA is enabled before checking architecture [\#1955](https://github.com/kokkos/kokkos-kernels/pull/1955) +- Avoid enum without fixed underlying type to fix SYCL [\#1940](https://github.com/kokkos/kokkos-kernels/pull/1940) +- Fix SpAdd perf test when offset/ordinal is not int [\#1928](https://github.com/kokkos/kokkos-kernels/pull/1928) +- Add KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS definition for architectures with independent thread scheduling [\#1927](https://github.com/kokkos/kokkos-kernels/pull/1927) +- Fix cm_generate_makefile --boundscheck [\#1926](https://github.com/kokkos/kokkos-kernels/pull/1926) +- Bsr compatibility [\#1925](https://github.com/kokkos/kokkos-kernels/pull/1925) +- BLAS: fix assignable check in gemv and gemm [\#1914](https://github.com/kokkos/kokkos-kernels/pull/1914) +- mdf: fix initial value in select pivot functor [\#1916](https://github.com/kokkos/kokkos-kernels/pull/1916) +- add missing headers, std::vector -> std::vector<...> [\#1909](https://github.com/kokkos/kokkos-kernels/pull/1909) +- Add missing include to Test_Sparse_MergeMatrix.hpp [\#1907](https://github.com/kokkos/kokkos-kernels/pull/1907) +- Remove non-existant dir from CMake include paths [\#1892](https://github.com/kokkos/kokkos-kernels/pull/1892) +- cusparse 12 spmv: check y vector alignment [\#1889](https://github.com/kokkos/kokkos-kernels/pull/1889) +- Change 'or' to '||' to fix compilation on MSVC [\#1885](https://github.com/kokkos/kokkos-kernels/pull/1885) +- Add missing KokkosKernels_Macros.hpp include [\#1884](https://github.com/kokkos/kokkos-kernels/pull/1884) +- Backward-compatible fix with kokkos@4.0 [\#1874](https://github.com/kokkos/kokkos-kernels/pull/1874) +- Fix for rocblas builds [\#1871](https://github.com/kokkos/kokkos-kernels/pull/1871) +- Correcting 'syr test' bug causing compilation errors with Trilinos [\#1870](https://github.com/kokkos/kokkos-kernels/pull/1870) +- Workaround for spiluk and sptrsv stream tests with OMP_NUM_THREADS of 1, 2, 3 [\#1864](https://github.com/kokkos/kokkos-kernels/pull/1864) +- bhalf_t fix for isnan function [\#2007](https://github.com/kokkos/kokkos-kernels/pull/2007) + + +## [4.1.00](https://github.com/kokkos/kokkos-kernels/tree/4.1.00) (2023-06-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.01...4.1.00) + +### New Features + +#### BLAS updates +- Adding interface with execution space instance argument to support execution of BLAS on stream + - Norms on stream [\#1795](https://github.com/kokkos/kokkos-kernels/pull/1795) + - Blas1 on stream [\#1803](https://github.com/kokkos/kokkos-kernels/pull/1803) + - Blas2 and 3 on stream [\#1812](https://github.com/kokkos/kokkos-kernels/pull/1812) +- Improving BLAS level 2 support by adding native implementation and TPL for GER, HER and SYR + - Implementation for BLAS2 ger [\#1756](https://github.com/kokkos/kokkos-kernels/pull/1756) + - Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) + +#### Batched updates +- Optimizing algorithms for single input data + - Add calls to KokkosBlas Dot and Axpy for team batched kernels when m==1 [\#1753](https://github.com/kokkos/kokkos-kernels/pull/1753) + - Add calls to KokkosBlas Gemv and Spmv for team batched kernels when m==1 [\#1770](https://github.com/kokkos/kokkos-kernels/pull/1770) + +#### Sparse updates +- Adding stream support to ILUK/SPTRSV and sort/merge + - Streams interface for SPILUK numeric [\#1728](https://github.com/kokkos/kokkos-kernels/pull/1728) + - Stream interface for SPTRSV solve [\#1820](https://github.com/kokkos/kokkos-kernels/pull/1820) + - Add exec instance support to sort/sort_and_merge utils [\#1744](https://github.com/kokkos/kokkos-kernels/pull/1744) +- Add BsrMatrix SpMV in rocSparse TPL, rewrite BsrMatrix SpMV unit tests [\#1769](https://github.com/kokkos/kokkos-kernels/pull/1769) +- sparse: Add coo2crs, crs2coo and CooMatrix [\#1686](https://github.com/kokkos/kokkos-kernels/pull/1686) +- Adds team- and thread-based lower-bound and upper-bound search and predicates [\#1711](https://github.com/kokkos/kokkos-kernels/pull/1711) +- Adds KokkosKernels::Impl::Iota, a view-like where iota(i) = i + offset [\#1710](https://github.com/kokkos/kokkos-kernels/pull/1710) + +#### Misc updates +- ODE: explicit integration methods [\#1754](https://github.com/kokkos/kokkos-kernels/pull/1754) + +### Enhancements: + +#### BLAS +- refactor blas3 tests to use benchmark library [\#1751](https://github.com/kokkos/kokkos-kernels/pull/1751) + +#### Batched +- batched/eti: ETI host-level interfaces [\#1783](https://github.com/kokkos/kokkos-kernels/pull/1783) +- batched/dense: Add gesv DynRankView runtime checks [\#1850](https://github.com/kokkos/kokkos-kernels/pull/1850) + +#### Sparse +- Add support for complex data types in MDF [\#1776](https://github.com/kokkos/kokkos-kernels/pull/1776) +- Sort and merge improvements [\#1773](https://github.com/kokkos/kokkos-kernels/pull/1773) +- spgemm handle: check that A,B,C graphs never change [\#1742](https://github.com/kokkos/kokkos-kernels/pull/1742) +- Fix/enhance backend issues on spadd perftest [\#1672](https://github.com/kokkos/kokkos-kernels/pull/1672) +- Spgemm perf test enhancements [\#1664](https://github.com/kokkos/kokkos-kernels/pull/1664) +- add explicit tests of opt-in algorithms in SpMV [\#1712](https://github.com/kokkos/kokkos-kernels/pull/1712) + +#### Common utilities +- Added TplsVersion file and print methods [\#1693](https://github.com/kokkos/kokkos-kernels/pull/1693) +- Add basis skeleton for KokkosKernels::print_configuration [\#1665](https://github.com/kokkos/kokkos-kernels/pull/1665) +- Add git information to benchmark context [\#1722](https://github.com/kokkos/kokkos-kernels/pull/1722) +- Test mixed scalars: more fixes related to mixed scalar tests [\#1694](https://github.com/kokkos/kokkos-kernels/pull/1694) +- PERF TESTS: adding utilities and instantiation wrapper [\#1676](https://github.com/kokkos/kokkos-kernels/pull/1676) + +#### TPL support +- Refactor MKL TPL for both CPU and GPU usage [\#1779](https://github.com/kokkos/kokkos-kernels/pull/1779) +- MKL: support indices properly [\#1868](https://github.com/kokkos/kokkos-kernels/pull/1868) +- Use rocsparse_spmv_ex for rocm >= 5.4.0 [\#1701](https://github.com/kokkos/kokkos-kernels/pull/1701) + + +### Build System: +- Do not change memory spaces instantiation defaults based on Kokkos_ENABLE_CUDA_UVM [\#1835](https://github.com/kokkos/kokkos-kernels/pull/1835) +- KokkosKernels: Remove TriBITS Kokkos subpackages (trilinos/Trilinos#11545) [\#1817](https://github.com/kokkos/kokkos-kernels/pull/1817) +- CMakeLists.txt: Add alias to match what is exported from Trilinos [\#1855](https://github.com/kokkos/kokkos-kernels/pull/1855) +- KokkosKernels: Don't list include for non-existant 'batched' build dir (trilinos/Trilinos#11966) [\#1867](https://github.com/kokkos/kokkos-kernels/pull/1867) +- Remove non-existant subdir kokkos-kernels/common/common (#11921, #11863) [\#1854](https://github.com/kokkos/kokkos-kernels/pull/1854) +- KokkosKernels: Remove non-existent common/src/[impl,tpls] include dirs (trilinos/Trilinos#11545) [\#1844](https://github.com/kokkos/kokkos-kernels/pull/1844) + +### Documentation and Testing: +- Enable sphinx werror [\#1856](https://github.com/kokkos/kokkos-kernels/pull/1856) +- Update cmake option naming in docs/comments [\#1849](https://github.com/kokkos/kokkos-kernels/pull/1849) +- docs/developer: Add Experimental namespace [\#1852](https://github.com/kokkos/kokkos-kernels/pull/1852) +- docs: Add profiling for compile times [\#1843](https://github.com/kokkos/kokkos-kernels/pull/1843) +- Ger: adding documentation stubs in apidocs [\#1822](https://github.com/kokkos/kokkos-kernels/pull/1822) +- .github/workflows: Summarize github-DOCS errors and warnings [\#1814](https://github.com/kokkos/kokkos-kernels/pull/1814) +- Blas1: docs update for PR #1803 [\#1805](https://github.com/kokkos/kokkos-kernels/pull/1805) +- apt-get update in hosted runner docs check [\#1797](https://github.com/kokkos/kokkos-kernels/pull/1797) +- scripts: Fix github-DOCS [\#1796](https://github.com/kokkos/kokkos-kernels/pull/1796) +- Add --enable-docs option to cm_generate_makefile [\#1785](https://github.com/kokkos/kokkos-kernels/pull/1785) +- docs: Add stubs for some sparse APIs [\#1768](https://github.com/kokkos/kokkos-kernels/pull/1768) +- .github: Update to actions/checkout@v3 [\#1767](https://github.com/kokkos/kokkos-kernels/pull/1767) +- docs: Include BatchedGemm [\#1765](https://github.com/kokkos/kokkos-kernels/pull/1765) +- .github: Automation reminder [\#1726](https://github.com/kokkos/kokkos-kernels/pull/1726) +- Allow an HTML-only docs build [\#1723](https://github.com/kokkos/kokkos-kernels/pull/1723) +- SYCL CI: Specify the full path to the compiler [\#1670](https://github.com/kokkos/kokkos-kernels/pull/1670) +- Add github DOCS ci check & disable Kokkos tests [\#1647](https://github.com/kokkos/kokkos-kernels/pull/1647) +- Add rocsparse,rocblas, to enabled TPLs in cm_test_all_sandia when --spot-check-tpls [\#1841](https://github.com/kokkos/kokkos-kernels/pull/1841) +- cm_test_all_sandia: update to add caraway queues for MI210, MI250 [\#1840](https://github.com/kokkos/kokkos-kernels/pull/1840) +- Support rocSparse in rocm 5.2.0 [\#1833](https://github.com/kokkos/kokkos-kernels/pull/1833) +- Add KokkosKernels_PullRequest_VEGA908_Tpls_ROCM520 support, only enable KokkosBlas::gesv where supported [\#1816](https://github.com/kokkos/kokkos-kernels/pull/1816) +- scripts: Include OMP settings [\#1801](https://github.com/kokkos/kokkos-kernels/pull/1801) +- Print the patch that clang-format-8 wants to apply [\#1714](https://github.com/kokkos/kokkos-kernels/pull/1714) + +### Benchmarks: +- Benchmark cleanup for par_ilut and spmv [\#1853](https://github.com/kokkos/kokkos-kernels/pull/1853) +- SpMV: adding benchmark for spmv [\#1821](https://github.com/kokkos/kokkos-kernels/pull/1821) +- New performance test for par_ilut, ginkgo::par_ilut, and spill [\#1799](https://github.com/kokkos/kokkos-kernels/pull/1799) +- Include OpenMP environment variables in benchmark context [\#1789](https://github.com/kokkos/kokkos-kernels/pull/1789) +- Re-enable and clean up triangle counting perf test [\#1752](https://github.com/kokkos/kokkos-kernels/pull/1752) +- Include google/benchmark lib version in benchmark output [\#1750](https://github.com/kokkos/kokkos-kernels/pull/1750) +- Refactor blas2 test for benchmark feature [\#1733](https://github.com/kokkos/kokkos-kernels/pull/1733) +- Adds a better parilut test with gmres [\#1661](https://github.com/kokkos/kokkos-kernels/pull/1661) +- Refactor blas1 test for benchmark feature [\#1636](https://github.com/kokkos/kokkos-kernels/pull/1636) + +### Cleanup: +- Drop outdated workarounds for backward compatibility with Kokkos [\#1836](https://github.com/kokkos/kokkos-kernels/pull/1836) +- Remove dead code guarded [\#1834](https://github.com/kokkos/kokkos-kernels/pull/1834) +- Remove decl ETI files [\#1824](https://github.com/kokkos/kokkos-kernels/pull/1824) +- Reorganize par_ilut performance test [\#1818](https://github.com/kokkos/kokkos-kernels/pull/1818) +- Deprecate Kokkos::Details::ArithTraits [\#1748](https://github.com/kokkos/kokkos-kernels/pull/1748) +- Drop obsolete workaround #ifdef KOKKOS_IF_ON_HOST [\#1720](https://github.com/kokkos/kokkos-kernels/pull/1720) +- Drop pre Kokkos 3.6 workaround [\#1653](https://github.com/kokkos/kokkos-kernels/pull/1653) +- View::Rank -> View::rank [\#1703](https://github.com/kokkos/kokkos-kernels/pull/1703) +- Prefer Kokkos::View::{R->r}ank [\#1679](https://github.com/kokkos/kokkos-kernels/pull/1679) +- Call concurrency(), not impl_thread_pool_size() [\#1666](https://github.com/kokkos/kokkos-kernels/pull/1666) +- Kokkos moves ALL_t out of Impl namespace [\#1658](https://github.com/kokkos/kokkos-kernels/pull/1658) +- Add KokkosKernels::Impl::are_integral_v helper variable template and quit using Kokkos::Impl::are_integral trait [\#1652](https://github.com/kokkos/kokkos-kernels/pull/1652) + +### Bug Fixes: +- Kokkos 4 compatibility: modifying the preprocessor logic [\#1827](https://github.com/kokkos/kokkos-kernels/pull/1827) +- blas/tpls: Fix gemm include guard typo [\#1848](https://github.com/kokkos/kokkos-kernels/pull/1848) +- spmv cusparse version check modified for cuda/11.1 [\#1828](https://github.com/kokkos/kokkos-kernels/pull/1828) +- Workaround for #1777 - cusparse spgemm test hang [\#1811](https://github.com/kokkos/kokkos-kernels/pull/1811) +- Fix 1798 [\#1800](https://github.com/kokkos/kokkos-kernels/pull/1800) +- BLAS: fixes and testing for LayoutStride [\#1794](https://github.com/kokkos/kokkos-kernels/pull/1794) +- Fix 1786: check that work array is contiguous in SVD [\#1793](https://github.com/kokkos/kokkos-kernels/pull/1793) +- Fix unused variable warnings [\#1790](https://github.com/kokkos/kokkos-kernels/pull/1790) +- Use KOKKOS_IMPL_DO_NOT_USE_PRINTF in Test_Common_UpperBound.hpp [\#1784](https://github.com/kokkos/kokkos-kernels/pull/1784) +- Batched Gesv: initializing variable to make compiler happy [\#1778](https://github.com/kokkos/kokkos-kernels/pull/1778) +- perf test utils: fix device ID parsing [\#1739](https://github.com/kokkos/kokkos-kernels/pull/1739) +- Fix OOB and improve comments in BsrMatrix COO constructor [\#1732](https://github.com/kokkos/kokkos-kernels/pull/1732) +- batched/unit_test: Disable simd dcomplex4 test in for intel > 19.05 and <= 2021. [\#1857](https://github.com/kokkos/kokkos-kernels/pull/1857) +- rocsparse spmv tpl: Fix rocsparse_spmv call for rocm < 5.4.0 [\#1716](https://github.com/kokkos/kokkos-kernels/pull/1716) +- compatibility with 4.0.0 [\#1709](https://github.com/kokkos/kokkos-kernels/pull/1709) +- team mult: fix type issue in max_error calculation [\#1706](https://github.com/kokkos/kokkos-kernels/pull/1706) +- cast Kokkos::Impl::integral_constant to int [\#1697](https://github.com/kokkos/kokkos-kernels/pull/1697) + + ## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-04-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.00...4.0.01) From 6a55d793427baf19ebb5e1fb5f38aef28fb1e1a4 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 7 Nov 2023 10:04:37 -0700 Subject: [PATCH 083/326] NRM1: refactoring TPL layer a bit with c++17 if constexpr Hopefully this leads to simpler code, less duplication, less macro and easier maintenance! Adding support for oneapi MKL while making tpl layer changes. --- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 34 + blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 772 +++++++----------- blas/unit_test/Test_Blas1_nrm1.hpp | 8 +- 3 files changed, 335 insertions(+), 479 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 04ec811990..be0a45c7be 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -113,6 +113,40 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +// oneMKL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) + +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View< \ + typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) + +#endif // KOKKOS_ENABLE_SYCL +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL + } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index b5b6e061ec..2e2c98a579 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -39,32 +39,39 @@ inline void nrm1_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ + using mag_type = typename Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ \ static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS," #SCALAR "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ nrm1_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = HostBlas::asum(N, X.data(), one); \ + if constexpr (Kokkos::ArithTraits::is_complex) { \ + R() = HostBlas>::asum( \ + N, reinterpret_cast*>(X.data()), \ + one); \ + } else { \ + R() = HostBlas::asum(N, X.data(), one); \ + } \ } else { \ Nrm1::nrm1(space, R, X); \ } \ @@ -72,128 +79,25 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm1< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::asum(N, X.data(), one); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, + Kokkos::HostSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, + Kokkos::HostSpace, false) -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::asum( \ - N, reinterpret_cast*>(X.data()), one); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, + Kokkos::HostSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, + Kokkos::HostSpace, false) -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::asum( \ - N, reinterpret_cast*>(X.data()), one); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, false) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -207,40 +111,65 @@ KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +template +void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, + const XViewType& X) { + using XScalar = typename XViewType::non_const_value_type; + + nrm1_print_specialization(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::CudaBlasSingleton& s = + KokkosBlas::Impl::CudaBlasSingleton::singleton(); + + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasSasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasDasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasScasum(s.handle, N, reinterpret_cast(X.data()), + one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( + s.handle, N, reinterpret_cast(X.data()), one, + R.data())); + } + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); +} + +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ template <> \ struct Nrm1< \ EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 1, true, ETI_SPEC_AVAIL> { \ using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ + using RV = Kokkos::View::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ \ static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS," #SCALAR \ + "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + cublasAsumWrapper(space, R, X); \ } else { \ Nrm1::nrm1(space, \ R, X); \ @@ -249,160 +178,33 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, \ - R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( \ - s.handle, N, reinterpret_cast(X.data()), \ - one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, \ - R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScasum( \ - s.handle, N, reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, \ - R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas - -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS @@ -411,39 +213,65 @@ KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +template +void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, + const XViewType& X) { + using XScalar = typename XViewType::non_const_value_type; + + nrm1_print_specialization(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::RocBlasSingleton& s = + KokkosBlas::Impl::RocBlasSingleton::singleton(); + + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_set_stream(s.handle, space.hip_stream())); + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_sasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_dasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( + s.handle, N, reinterpret_cast(X.data()), + one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( + s.handle, N, reinterpret_cast(X.data()), + one, R.data())); + } + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); +} + +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ + using RV = Kokkos::View::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ \ static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS," #SCALAR \ + "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + rocblasAsumWrapper(space, R, X); \ } else { \ Nrm1::nrm1(space, R, X); \ } \ @@ -451,155 +279,149 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( \ - s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +} // namespace Impl +} // namespace KokkosBlas -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( \ - s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +// oneMKL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +#include +#include -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +namespace KokkosBlas { +namespace Impl { + +template +void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, + const XViewType& X) { + using XScalar = typename XViewType::non_const_value_type; + using KAT_X = Kokkos::ArithTraits; + using layout_t = typename XViewType::array_layout; + + const std::int64_t N = static_cast(X.extent(0)); + const std::int64_t one = Kokkos::ArithTraits::one(); + + // Create temp view on device to store the result + Kokkos::View::mag_type, + typename XViewType::memory_space> + res("sycl asum result"); + + // Decide to call row_major or column_major function + if constexpr (std::is_same_v) { + if constexpr (KAT_X::is_complex) { + oneapi::mkl::blas::row_major::asum( + space.sycl_queue(), N, + reinterpret_cast*>( + X.data()), + 1, res.data()); + } else { + oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, X.data(), 1, + res.data()); + } + } else { + if constexpr (KAT_X::is_complex) { + oneapi::mkl::blas::column_major::asum( + space.sycl_queue(), N, + reinterpret_cast*>( + X.data()), + 1, res.data()); + } else { + oneapi::mkl::blas::column_major::asum(space.sycl_queue(), X.extent_int(0), + X.data(), 1, res.data()); + } + } + // Bring result back to host + Kokkos::deep_copy(space, R, res); +} + +#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm1< \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + using RV = Kokkos::View::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ONEMKL," #SCALAR \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + onemklAsumWrapper(space, R, X); \ + } else { \ + Nrm1::nrm1(space, \ + R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutRight, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutRight, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOS_ENABLE_SYCL +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL #endif diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index f6938c5147..24795878d1 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -22,10 +22,10 @@ namespace Test { template void impl_test_nrm1(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; - typedef typename AT::mag_type mag_type; - typedef Kokkos::ArithTraits MAT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using mag_type = typename AT::mag_type; + using MAT = Kokkos::ArithTraits; view_stride_adapter a("a", N); From 9007f55f6ac3779112db34db8318a30797c63a2a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 20 Nov 2023 17:07:20 -0700 Subject: [PATCH 084/326] BLAS: Nrm1 implementing Brian's feedback --- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 194 +++++++++++-------- 1 file changed, 110 insertions(+), 84 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 2e2c98a579..bbd7e4139e 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -39,26 +39,34 @@ inline void nrm1_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct Nrm1< \ - ExecSpace, \ + EXECSPACE, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ + 1, true, \ + nrm1_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ using mag_type = typename Kokkos::ArithTraits::mag_type; \ using RV = Kokkos::View>; \ using XV = Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>; \ using size_type = typename XV::size_type; \ \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + static void nrm1(const EXECSPACE& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS," #SCALAR "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -73,31 +81,35 @@ namespace Impl { R() = HostBlas::asum(N, X.data(), one); \ } \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) - +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) - + Kokkos::Serial, Kokkos::HostSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) + Kokkos::Serial, Kokkos::HostSpace) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) + Kokkos::OpenMP, Kokkos::HostSpace) +#endif } // namespace Impl } // namespace KokkosBlas @@ -145,7 +157,7 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, } #define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ + MEMSPACE) \ template <> \ struct Nrm1< \ EXECSPACE, \ @@ -154,7 +166,15 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ + 1, true, \ + nrm1_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ using execution_space = EXECSPACE; \ using RV = Kokkos::View::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -171,36 +191,37 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, if (numElems < static_cast(INT_MAX)) { \ cublasAsumWrapper(space, R, X); \ } else { \ - Nrm1::nrm1(space, \ - R, X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) + Kokkos::CudaSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - + Kokkos::CudaSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) + Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) + Kokkos::CudaUVMSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) + Kokkos::CudaUVMSpace) +#endif } // namespace Impl } // namespace KokkosBlas @@ -247,8 +268,7 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct Nrm1< \ ExecSpace, \ @@ -257,7 +277,15 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ + 1, true, \ + nrm1_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ using RV = Kokkos::View::mag_type, \ LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits>; \ @@ -273,35 +301,22 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, if (numElems < static_cast(INT_MAX)) { \ rocblasAsumWrapper(space, R, X); \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) + Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) + Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) - -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) + Kokkos::LayoutLeft, Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosBlas @@ -327,8 +342,7 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, using KAT_X = Kokkos::ArithTraits; using layout_t = typename XViewType::array_layout; - const std::int64_t N = static_cast(X.extent(0)); - const std::int64_t one = Kokkos::ArithTraits::one(); + const std::int64_t N = static_cast(X.extent(0)); // Create temp view on device to store the result Kokkos::View::mag_type, @@ -373,7 +387,15 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ + 1, true, \ + nrm1_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ using execution_space = EXECSPACE; \ using RV = Kokkos::View::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -390,33 +412,37 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, if (numElems < static_cast(INT_MAX)) { \ onemklAsumWrapper(space, R, X); \ } else { \ - Nrm1::nrm1(space, \ - R, X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutRight, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) + Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutRight, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) + Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) + Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) + Kokkos::Experimental::SYCLSharedUSMSpace) +#endif } // namespace Impl } // namespace KokkosBlas From d7f5e8ea237e48aba9a6645e9dc2ca5d5a63b5b1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 21 Nov 2023 10:39:09 -0700 Subject: [PATCH 085/326] Blas: nrm1, fix in tpl spec decl --- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index bbd7e4139e..79822b452e 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -50,7 +50,7 @@ namespace Impl { Kokkos::View, \ Kokkos::MemoryTraits>, \ 1, true, \ - nrm1_tpl_spec_avail< \ + nrm1_eti_spec_avail< \ EXECSPACE, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ @@ -82,7 +82,7 @@ namespace Impl { } \ } else { \ Nrm1::value>::nrm1(space, R, \ + nrm1_eti_spec_avail::value>::nrm1(space, R, \ X); \ } \ Kokkos::Profiling::popRegion(); \ @@ -167,7 +167,7 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, Kokkos::View, \ Kokkos::MemoryTraits>, \ 1, true, \ - nrm1_tpl_spec_avail< \ + nrm1_eti_spec_avail< \ EXECSPACE, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ @@ -192,7 +192,7 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, cublasAsumWrapper(space, R, X); \ } else { \ Nrm1::value>::nrm1(space, R, \ + nrm1_eti_spec_avail::value>::nrm1(space, R, \ X); \ } \ Kokkos::Profiling::popRegion(); \ @@ -278,7 +278,7 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, Kokkos::View, \ Kokkos::MemoryTraits>, \ 1, true, \ - nrm1_tpl_spec_avail< \ + nrm1_eti_spec_avail< \ ExecSpace, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ @@ -302,7 +302,7 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, rocblasAsumWrapper(space, R, X); \ } else { \ Nrm1::value>::nrm1(space, R, \ + nrm1_eti_spec_avail::value>::nrm1(space, R, \ X); \ } \ Kokkos::Profiling::popRegion(); \ @@ -388,7 +388,7 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, Kokkos::View, \ Kokkos::MemoryTraits>, \ 1, true, \ - nrm1_tpl_spec_avail< \ + nrm1_eti_spec_avail< \ EXECSPACE, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ @@ -413,7 +413,7 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, onemklAsumWrapper(space, R, X); \ } else { \ Nrm1::value>::nrm1(space, R, \ + nrm1_eti_spec_avail::value>::nrm1(space, R, \ X); \ } \ Kokkos::Profiling::popRegion(); \ From b1cea63873fcedeb60c2ed6c29e930c5a6621a4f Mon Sep 17 00:00:00 2001 From: "Luc Berger-Vergiat (-EXP)" Date: Tue, 21 Nov 2023 15:39:18 -0700 Subject: [PATCH 086/326] BLAS: nrm1 problems with ExecSpace template and lack of Kokkos::Threads Fix issue with Kokkos::Threads and Kokkos::HIP --- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 109 ++++++++++--------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 79822b452e..c695eaee1e 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -111,6 +111,17 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) #endif +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +#endif + } // namespace Impl } // namespace KokkosBlas @@ -156,31 +167,31 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); } -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template <> \ struct Nrm1< \ - EXECSPACE, \ + Kokkos::Cuda, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ 1, true, \ nrm1_eti_spec_avail< \ - EXECSPACE, \ + Kokkos::Cuda, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>>::value> { \ - using execution_space = EXECSPACE; \ + using execution_space = Kokkos::Cuda; \ using RV = Kokkos::View::mag_type, \ LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits>; \ using XV = Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>; \ using size_type = typename XV::size_type; \ \ @@ -192,35 +203,31 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, cublasAsumWrapper(space, R, X); \ } else { \ Nrm1::value>::nrm1(space, R, \ - X); \ + nrm1_eti_spec_avail::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -269,41 +276,42 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, } #define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ + template <> \ struct Nrm1< \ - ExecSpace, \ + Kokkos::HIP, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ 1, true, \ nrm1_eti_spec_avail< \ - ExecSpace, \ + Kokkos::HIP, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>>::value> { \ using RV = Kokkos::View::mag_type, \ LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits>; \ using XV = Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>; \ using size_type = typename XV::size_type; \ \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + static void nrm1(const Kokkos::HIP& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS," #SCALAR \ "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ rocblasAsumWrapper(space, R, X); \ } else { \ - Nrm1::value>::nrm1(space, R, \ - X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -377,32 +385,33 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, Kokkos::deep_copy(space, R, res); } -#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, MEMSPACE) \ template <> \ struct Nrm1< \ - EXECSPACE, \ + Kokkos::Experimental::SYCL, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ 1, true, \ nrm1_eti_spec_avail< \ - EXECSPACE, \ + Kokkos::Experimental::SYCL, \ Kokkos::View::mag_type, LAYOUT, \ Kokkos::HostSpace, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits>>::value> { \ - using execution_space = EXECSPACE; \ + using execution_space = Kokkos::Experimental::SYCL; \ using RV = Kokkos::View::mag_type, \ LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits>; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using XV = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ using size_type = typename XV::size_type; \ \ static void nrm1(const execution_space& space, RV& R, const XV& X) { \ @@ -413,34 +422,30 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, onemklAsumWrapper(space, R, X); \ } else { \ Nrm1::value>::nrm1(space, R, \ - X); \ + nrm1_eti_spec_avail::value>::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) -KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLSharedUSMSpace) KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLSharedUSMSpace) #endif From fbaac455a8c56000c318b38491650444f08e3cf6 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 22 Nov 2023 14:46:22 -0700 Subject: [PATCH 087/326] Another attempt while waiting to get access to the solo cluster --- .../Test_Blas1_axpby_unification.hpp | 44 ++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index c75a4138cd..ae04b357fa 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -356,23 +356,47 @@ void impl_test_axpby_mv_unification_compare( (void)valueB; // Avoid "set but not used" error int a_k(a.h_view.extent(0) == 1 ? 0 : k); int b_k(b.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k) + - b.h_view(b_k) * org_y.h_view(i, k)); + if (b.h_view(b_k) == Kokkos::ArithTraits::zero()) { + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k)); + } + else { + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k) + + b.h_view(b_k) * org_y.h_view(i, k)); + } } else { int a_k(a.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast( - a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + if (valueB == Kokkos::ArithTraits::zero()) { + vanillaValue = static_cast( + a.h_view(a_k) * x.h_view(i, k)); + } + else { + vanillaValue = static_cast( + a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + } } } else { if constexpr (bIsRank1) { (void)valueB; // Avoid "set but not used" error int b_k(b.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast( - valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + if (b.h_view(b_k) == Kokkos::ArithTraits::zero()) { + vanillaValue = static_cast( + valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + } + else { + vanillaValue = static_cast( + valueA * x.h_view(i, k)); + } } else { - vanillaValue = static_cast( - valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + if (valueB == Kokkos::ArithTraits::zero()) { + vanillaValue = static_cast( + valueA * x.h_view(i, k)); + } + else { + vanillaValue = static_cast( + valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + } } } @@ -1043,7 +1067,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // in which the result is computed. using MagnitudeB = typename Kokkos::ArithTraits::mag_type; MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); - MagnitudeB const max_val = 40; + MagnitudeB const max_val = 100; MagnitudeB const max_error = static_cast( Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + From 9285b6a8ce5265426a300e1684c50b42278345b0 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 22 Nov 2023 14:49:09 -0700 Subject: [PATCH 088/326] Formatting --- .../Test_Blas1_axpby_unification.hpp | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index ae04b357fa..615ae1602a 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -359,8 +359,7 @@ void impl_test_axpby_mv_unification_compare( if (b.h_view(b_k) == Kokkos::ArithTraits::zero()) { vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k)); - } - else { + } else { vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); @@ -368,10 +367,9 @@ void impl_test_axpby_mv_unification_compare( } else { int a_k(a.h_view.extent(0) == 1 ? 0 : k); if (valueB == Kokkos::ArithTraits::zero()) { - vanillaValue = static_cast( - a.h_view(a_k) * x.h_view(i, k)); - } - else { + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k)); + } else { vanillaValue = static_cast( a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } @@ -383,17 +381,13 @@ void impl_test_axpby_mv_unification_compare( if (b.h_view(b_k) == Kokkos::ArithTraits::zero()) { vanillaValue = static_cast( valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); - } - else { - vanillaValue = static_cast( - valueA * x.h_view(i, k)); + } else { + vanillaValue = static_cast(valueA * x.h_view(i, k)); } } else { if (valueB == Kokkos::ArithTraits::zero()) { - vanillaValue = static_cast( - valueA * x.h_view(i, k)); - } - else { + vanillaValue = static_cast(valueA * x.h_view(i, k)); + } else { vanillaValue = static_cast( valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } From 88cec7bda914fb6884ec60761aa9e373ef85bdd0 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 22 Nov 2023 15:12:24 -0700 Subject: [PATCH 089/326] Correction error from the last commit --- blas/unit_test/Test_Blas1_axpby_unification.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 615ae1602a..92b3769b8f 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -379,10 +379,10 @@ void impl_test_axpby_mv_unification_compare( (void)valueB; // Avoid "set but not used" error int b_k(b.h_view.extent(0) == 1 ? 0 : k); if (b.h_view(b_k) == Kokkos::ArithTraits::zero()) { + vanillaValue = static_cast(valueA * x.h_view(i, k)); + } else { vanillaValue = static_cast( valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); - } else { - vanillaValue = static_cast(valueA * x.h_view(i, k)); } } else { if (valueB == Kokkos::ArithTraits::zero()) { From 4450d204e181ba25a16c00fd007d923e219546ad Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 22 Nov 2023 21:45:59 -0700 Subject: [PATCH 090/326] Fixing the error that was happening only at the solo cluster --- .../Test_Blas1_axpby_unification.hpp | 72 +++++++++++-------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 92b3769b8f..d33cfd55ad 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -356,45 +356,48 @@ void impl_test_axpby_mv_unification_compare( (void)valueB; // Avoid "set but not used" error int a_k(a.h_view.extent(0) == 1 ? 0 : k); int b_k(b.h_view.extent(0) == 1 ? 0 : k); - if (b.h_view(b_k) == Kokkos::ArithTraits::zero()) { - vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k)); - } else { - vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k) + - b.h_view(b_k) * org_y.h_view(i, k)); - } +#if 0 + std::cout << "In impl_test_axpby_mv_unification_compare()" + << ": i = " << i + << ", k = " << k + << ", a.h_view.extent(0) = " << a.h_view.extent(0) + << ", a_k = " << a_k + << ", b.h_view.extent(0) = " << b.h_view.extent(0) + << ", b_k = " << b_k + << ", a.h_view(a_k) = " << a.h_view(a_k) + << ", x.h_view(i, k) = " << x.h_view(i, k) + << ", b.h_view(b_k) = " << b.h_view(b_k) + << ", org_y.h_view(i, k) = " << org_y.h_view(i, k) + << std::endl; +#endif + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k) + + b.h_view(b_k) * org_y.h_view(i, k)); } else { int a_k(a.h_view.extent(0) == 1 ? 0 : k); - if (valueB == Kokkos::ArithTraits::zero()) { - vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k)); - } else { - vanillaValue = static_cast( - a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); - } + vanillaValue = static_cast( + a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } } else { if constexpr (bIsRank1) { (void)valueB; // Avoid "set but not used" error int b_k(b.h_view.extent(0) == 1 ? 0 : k); - if (b.h_view(b_k) == Kokkos::ArithTraits::zero()) { - vanillaValue = static_cast(valueA * x.h_view(i, k)); - } else { - vanillaValue = static_cast( - valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); - } + vanillaValue = static_cast( + valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); } else { - if (valueB == Kokkos::ArithTraits::zero()) { - vanillaValue = static_cast(valueA * x.h_view(i, k)); - } else { - vanillaValue = static_cast( - valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); - } + vanillaValue = static_cast( + valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } } - - EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); +#if 0 + std::cout << "In impl_test_axpby_mv_unification_compare(1)" + << ": i = " << i + << ", k = " << k + << ", y.h_view(i, k) = " << y.h_view(i, k) + << ", vanillaValue = " << vanillaValue + << std::endl; +#endif + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 3. * max_error); } } } else { @@ -451,7 +454,14 @@ void impl_test_axpby_mv_unification_compare( } else { EXPECT_NE(y.h_view(i, k), Kokkos::ArithTraits::nan()); } - +#if 0 + std::cout << "In impl_test_axpby_mv_unification_compare(2)" + << ": i = " << i + << ", k = " << k + << ", y.h_view(i, k) = " << y.h_view(i, k) + << ", vanillaValue = " << vanillaValue + << std::endl; +#endif EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); } } @@ -1061,7 +1071,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // in which the result is computed. using MagnitudeB = typename Kokkos::ArithTraits::mag_type; MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); - MagnitudeB const max_val = 100; + MagnitudeB const max_val = 10; MagnitudeB const max_error = static_cast( Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + From 9105a8a059358a2901b92b026ce31c57982e161d Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 22 Nov 2023 23:40:03 -0700 Subject: [PATCH 091/326] Increase tolerance a bit more --- blas/unit_test/Test_Blas1_axpby_unification.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index d33cfd55ad..2284f28586 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -397,7 +397,7 @@ void impl_test_axpby_mv_unification_compare( << ", vanillaValue = " << vanillaValue << std::endl; #endif - EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 3. * max_error); + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 4. * max_error); } } } else { From baab6f59fe26b1860d417477051c8703daa8d8b2 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 23 Nov 2023 02:10:53 -0700 Subject: [PATCH 092/326] ncreasing tolerances in all 4 locations --- blas/unit_test/Test_Blas1_axpby_unification.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 2284f28586..9709d580b3 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -191,7 +191,7 @@ void impl_test_axpby_unification_compare( for (int i(0); i < N; ++i) { EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + valueB * org_y.h_view(i)), - y.h_view(i), 2. * max_error); + y.h_view(i), 4. * max_error); } } else { // ******************************************************** @@ -221,7 +221,7 @@ void impl_test_axpby_unification_compare( EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); } EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), - y.h_view(i), 2. * max_error); + y.h_view(i), 4. * max_error); } } } @@ -462,7 +462,7 @@ void impl_test_axpby_mv_unification_compare( << ", vanillaValue = " << vanillaValue << std::endl; #endif - EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 2. * max_error); + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 4. * max_error); } } } From 44d8a26735d585873dea9102f6fca6d31191ef68 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 21 Nov 2023 15:26:39 -0700 Subject: [PATCH 093/326] Backup --- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 8 +++---- .../src/KokkosSparse_gauss_seidel_handle.hpp | 6 ++--- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 24 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 501e71e3e7..28f2997e36 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -555,22 +555,22 @@ class ClusterGaussSeidel { } nnz_view_t vertClusters; auto clusterAlgo = gsHandle->get_clustering_algo(); - if (clusterAlgo == CLUSTER_DEFAULT) clusterAlgo = CLUSTER_MIS2; + if (clusterAlgo == ClusteringAlgorithm::CLUSTER_DEFAULT) clusterAlgo = ClusteringAlgorithm::CLUSTER_MIS2; switch (clusterAlgo) { - case CLUSTER_MIS2: { + case ClusteringAlgorithm::CLUSTER_MIS2: { vertClusters = KokkosGraph::graph_mis2_aggregate( raw_sym_xadj, raw_sym_adj, numClusters); break; } - case CLUSTER_BALLOON: { + case ClusteringAlgorithm::CLUSTER_BALLOON: { BalloonClustering balloon( num_rows, raw_sym_xadj, raw_sym_adj); vertClusters = balloon.run(clusterSize); break; } - case CLUSTER_DEFAULT: { + case ClusteringAlgorithm::CLUSTER_DEFAULT: { throw std::logic_error( "Logic to choose default clustering algorithm is incorrect"); } diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 649229918d..7c10ea6eb8 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -29,7 +29,7 @@ namespace KokkosSparse { enum GSAlgorithm { GS_DEFAULT, GS_PERMUTED, GS_TEAM, GS_CLUSTER, GS_TWOSTAGE }; enum GSDirection { GS_FORWARD, GS_BACKWARD, GS_SYMMETRIC }; -enum ClusteringAlgorithm { +enum struct ClusteringAlgorithm { CLUSTER_DEFAULT, CLUSTER_MIS2, CLUSTER_BALLOON, @@ -38,8 +38,8 @@ enum ClusteringAlgorithm { inline const char *getClusterAlgoName(ClusteringAlgorithm ca) { switch (ca) { - case CLUSTER_BALLOON: return "Balloon"; - case CLUSTER_MIS2: return "MIS(2)"; + case ClusteringAlgorithm::CLUSTER_BALLOON: return "Balloon"; + case ClusteringAlgorithm::CLUSTER_MIS2: return "MIS(2)"; default:; } return "INVALID CLUSTERING ALGORITHM"; diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 35fbcb44a4..ad54ea309b 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -100,13 +100,13 @@ void run_gauss_seidel( template void run_gauss_seidel( - crsMat_t input_mat, GSAlgorithm gs_algorithm, vec_t x_vector, - vec_t y_vector, bool is_symmetric_graph, - int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. - int cluster_size = 1, - bool classic = + crsMat_t input_mat, GSAlgorithm gs_algorithm, vec_t x_vector, // Aqui 1-3 + vec_t y_vector, bool is_symmetric_graph, // Aqui 4-5 + int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. // Aqui 6 + int cluster_size = 1, // Aqui 7 + bool classic = // Aqui 8 false, // only with two-stage, true for sptrsv instead of richardson - ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT, + ClusteringAlgorithm clusterAlgo = ClusteringAlgorithm::CLUSTER_DEFAULT, KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT) { using size_type = typename crsMat_t::size_type; @@ -246,8 +246,8 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, } //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; - std::vector clusteringAlgos = {CLUSTER_MIS2, - CLUSTER_BALLOON}; + std::vector clusteringAlgos = {ClusteringAlgorithm::CLUSTER_MIS2, + ClusteringAlgorithm::CLUSTER_BALLOON}; for (int csize = 0; csize < 3; csize++) { for (auto clusterAlgo : clusteringAlgos) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { @@ -350,13 +350,13 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; for (int csize = 0; csize < 3; csize++) { - for (int algo = 0; algo < (int)NUM_CLUSTERING_ALGORITHMS; algo++) { + for (int algo = 0; algo < (int)ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; algo++) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; // Zero out X before solving Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel(input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, - apply_type, clusterSizes[csize], + run_gauss_seidel(input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, // Aqui + apply_type, clusterSizes[csize], false, (ClusteringAlgorithm)algo); Kokkos::deep_copy(x_host, x_vector); for (lno_t i = 0; i < numVecs; i++) { @@ -553,7 +553,7 @@ void test_gauss_seidel_empty() { for (const int rowmapLen : {0, 1, 5}) { KernelHandle kh; if (doingCluster) - kh.create_gs_handle(CLUSTER_DEFAULT, 10); + kh.create_gs_handle(ClusteringAlgorithm::CLUSTER_DEFAULT, 10); else kh.create_gs_handle(GS_DEFAULT); const auto nRows = KOKKOSKERNELS_MACRO_MAX(0, rowmapLen - 1); From 3ba6dedd5d9bd3f223cd4a9b5ea96a049266c4d0 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 21 Nov 2023 18:33:51 -0700 Subject: [PATCH 094/326] Backup --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index ad54ea309b..abd557047f 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -100,11 +100,11 @@ void run_gauss_seidel( template void run_gauss_seidel( - crsMat_t input_mat, GSAlgorithm gs_algorithm, vec_t x_vector, // Aqui 1-3 - vec_t y_vector, bool is_symmetric_graph, // Aqui 4-5 - int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. // Aqui 6 - int cluster_size = 1, // Aqui 7 - bool classic = // Aqui 8 + crsMat_t input_mat, GSAlgorithm gs_algorithm, vec_t x_vector, + vec_t y_vector, bool is_symmetric_graph, + int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. + int cluster_size = 1, + bool classic = false, // only with two-stage, true for sptrsv instead of richardson ClusteringAlgorithm clusterAlgo = ClusteringAlgorithm::CLUSTER_DEFAULT, KokkosGraph::ColoringAlgorithm coloringAlgo = @@ -355,7 +355,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Timer timer1; // Zero out X before solving Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel(input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, // Aqui + run_gauss_seidel(input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], false, (ClusteringAlgorithm)algo); Kokkos::deep_copy(x_host, x_vector); From 168eb0e137d9586913d5ac5938ec804e33798d9a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 24 Nov 2023 21:26:21 -0700 Subject: [PATCH 095/326] Formatting --- sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp | 3 ++- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 28f2997e36..6cf8df991c 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -555,7 +555,8 @@ class ClusterGaussSeidel { } nnz_view_t vertClusters; auto clusterAlgo = gsHandle->get_clustering_algo(); - if (clusterAlgo == ClusteringAlgorithm::CLUSTER_DEFAULT) clusterAlgo = ClusteringAlgorithm::CLUSTER_MIS2; + if (clusterAlgo == ClusteringAlgorithm::CLUSTER_DEFAULT) + clusterAlgo = ClusteringAlgorithm::CLUSTER_MIS2; switch (clusterAlgo) { case ClusteringAlgorithm::CLUSTER_MIS2: { vertClusters = diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index abd557047f..c398a1f759 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -246,8 +246,8 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, } //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; - std::vector clusteringAlgos = {ClusteringAlgorithm::CLUSTER_MIS2, - ClusteringAlgorithm::CLUSTER_BALLOON}; + std::vector clusteringAlgos = { + ClusteringAlgorithm::CLUSTER_MIS2, ClusteringAlgorithm::CLUSTER_BALLOON}; for (int csize = 0; csize < 3; csize++) { for (auto clusterAlgo : clusteringAlgos) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { @@ -350,7 +350,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; for (int csize = 0; csize < 3; csize++) { - for (int algo = 0; algo < (int)ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; algo++) { + for (int algo = 0; + algo < (int)ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; algo++) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; // Zero out X before solving From 5e714d7f590c20f0659bc4cf1b41e42991706ed2 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 24 Nov 2023 23:09:46 -0700 Subject: [PATCH 096/326] Forgot to add ClusteringAlgorithm:: at some spots --- perf_test/sparse/KokkosSparse_gs.cpp | 6 +++--- perf_test/sparse/KokkosSparse_pcg.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 163fdb2dd1..deb47796c6 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -59,7 +59,7 @@ struct GS_Parameters { // Point: int longRowThreshold = 0; // Cluster: - ClusteringAlgorithm coarse_algo = CLUSTER_DEFAULT; + ClusteringAlgorithm coarse_algo = ClusteringAlgorithm::CLUSTER_DEFAULT; int cluster_size = 10; // Two stage: bool classic = false; @@ -455,9 +455,9 @@ int main(int argc, char** argv) { else if (!strcmp(argv[i], "--coarse-algo")) { const char* algo = getNextArg(i, argc, argv); if (!strcmp(algo, "balloon")) - params.coarse_algo = CLUSTER_BALLOON; + params.coarse_algo = ClusteringAlgorithm::CLUSTER_BALLOON; else if (!strcmp(algo, "mis2")) - params.coarse_algo = CLUSTER_MIS2; + params.coarse_algo = ClusteringAlgorithm::CLUSTER_MIS2; else { std::cout << "Error: invalid coarsening algorithm. Options are balloon " "and mis2.\n"; diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 9825f7c90d..465e110e9c 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -83,7 +83,7 @@ void run_experiment(crsMat_t crsmat, int clusterSize, bool useSequential) { if (clusterSize == 1) kh.create_gs_handle(); else - kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, clusterSize); + kh.create_gs_handle(KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON, clusterSize); Kokkos::Timer timer1; KokkosKernels::Experimental::Example::pcgsolve( kh, crsmat, kok_b_vector, kok_x_vector, cg_iteration_limit, From 4006d80700826934b1d503044696230c4ee739a6 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 24 Nov 2023 23:12:59 -0700 Subject: [PATCH 097/326] Formatting --- perf_test/sparse/KokkosSparse_pcg.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 465e110e9c..44519a7ad0 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -83,7 +83,8 @@ void run_experiment(crsMat_t crsmat, int clusterSize, bool useSequential) { if (clusterSize == 1) kh.create_gs_handle(); else - kh.create_gs_handle(KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON, clusterSize); + kh.create_gs_handle(KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON, + clusterSize); Kokkos::Timer timer1; KokkosKernels::Experimental::Example::pcgsolve( kh, crsmat, kok_b_vector, kok_x_vector, cg_iteration_limit, From d1aa2b09b27689e0576855fe573a832584a89b82 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 22 Nov 2023 16:05:37 -0700 Subject: [PATCH 098/326] Lapack: fixing issue with Magma TPL in gesv, trtri, etc... Adding proper support for MAGMA after having it moved to the Lapack directory and checking it does not create issues with cuSOLVER. --- blas/src/KokkosBlas1_swap.hpp | 16 ++-- blas/unit_test/Test_Blas1_swap.hpp | 21 +++-- lapack/src/KokkosLapack_gesv.hpp | 25 ++++++ lapack/tpls/KokkosLapack_Cuda_tpl.hpp | 2 +- .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 24 +++++- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 77 +++++++++++-------- lapack/tpls/KokkosLapack_magma.hpp | 39 ++++++++++ .../tpls/KokkosLapack_trtri_tpl_spec_decl.hpp | 5 +- lapack/unit_test/Test_Lapack_gesv.hpp | 18 ++++- 9 files changed, 171 insertions(+), 56 deletions(-) create mode 100644 lapack/tpls/KokkosLapack_magma.hpp diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index 26c529f3b7..9ddcd106df 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -26,12 +26,12 @@ namespace KokkosBlas { /// \brief Swaps the entries of vectors x and y. /// /// \tparam execution_space an execution space to perform parallel work -/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. -/// \tparam YVector Type of the first vector y; a 1-D Kokkos::View. +/// \tparam XVector Type of the first vector x; a rank 1 Kokkos::View. +/// \tparam YVector Type of the first vector y; a rank 1 Kokkos::View. /// /// \param space [in] execution space passed to execution policies -/// \param x [in/out] 1-D View. -/// \param y [in/out] 1-D View. +/// \param x [in/out] rank 1 View. +/// \param y [in/out] rank 1 View. /// /// Swaps x and y. Note that this is akin to performing a deep_copy, swapping /// pointers inside view can only be performed if no aliasing, subviews, etc... @@ -100,11 +100,11 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { /// \brief Swaps the entries of vectors x and y. /// -/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. -/// \tparam YVector Type of the first vector y; a 1-D Kokkos::View. +/// \tparam XVector Type of the first vector x; a rank 1 Kokkos::View. +/// \tparam YVector Type of the first vector y; a rank 1 Kokkos::View. /// -/// \param x [in/out] 1-D View. -/// \param y [in/out] 1-D View. +/// \param x [in/out] rank 1 View. +/// \param y [in/out] rank 1 View. /// /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking. Note that the kernel will be diff --git a/blas/unit_test/Test_Blas1_swap.hpp b/blas/unit_test/Test_Blas1_swap.hpp index 382c35947b..2cf1a4ab8b 100644 --- a/blas/unit_test/Test_Blas1_swap.hpp +++ b/blas/unit_test/Test_Blas1_swap.hpp @@ -3,11 +3,12 @@ namespace Test { namespace Impl { -template +template void test_swap(int const vector_length) { - using vector_type = VectorType; - using execution_space = typename vector_type::execution_space; - using scalar_type = typename VectorType::non_const_value_type; + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::execution_space; + using vector_type = Kokkos::View; + using scalar_type = typename vector_type::non_const_value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; // Note that Xref and Yref need to always be copies of X and Y @@ -43,14 +44,12 @@ void test_swap(int const vector_length) { } // namespace Impl } // namespace Test -template +template int test_swap() { - using Vector = Kokkos::View; - - Test::Impl::test_swap(0); - Test::Impl::test_swap(10); - Test::Impl::test_swap(256); - Test::Impl::test_swap(1024); + Test::Impl::test_swap(0); + Test::Impl::test_swap(10); + Test::Impl::test_swap(256); + Test::Impl::test_swap(1024); return 0; } diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp index a37cfd95fe..b66583bbdf 100644 --- a/lapack/src/KokkosLapack_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -40,6 +40,8 @@ namespace KokkosLapack { /// 1-D or 2-D Kokkos::View. /// \tparam IPIVV Output pivot indices, as a 1-D Kokkos::View /// +/// \param space [in] execution space instance used to specified how to execute +/// the gesv kernels. /// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the /// factors L and U from /// the factorization A = P*L*U; the unit diagonal elements of L are not @@ -166,6 +168,29 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, } } +/// \brief Solve the dense linear equation system A*X = B. +/// +/// \tparam AMatrix Input matrix/Output LU, as a 2-D Kokkos::View. +/// \tparam BXMV Input (right-hand side)/Output (solution) (multi)vector, as a +/// 1-D or 2-D Kokkos::View. +/// \tparam IPIVV Output pivot indices, as a 1-D Kokkos::View +/// +/// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the +/// factors L and U from +/// the factorization A = P*L*U; the unit diagonal elements of L are not +/// stored. +/// \param B [in,out] On entry, the right hand side (multi)vector B. On exit, +/// the solution (multi)vector X. +/// \param IPIV [out] On exit, the pivot indices (for partial pivoting). +/// If the View extents are zero and its data pointer is NULL, pivoting is not +/// used. +/// +template +void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { + typename AMatrix::execution_space space{}; + gesv(space, A, B, IPIV); +} + } // namespace KokkosLapack #endif // KOKKOSLAPACK_GESV_HPP_ diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 8f61c7f767..6749a4740f 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -40,7 +40,7 @@ CudaLapackSingleton& CudaLapackSingleton::singleton() { #endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) -#include +#include namespace KokkosLapack { namespace Impl { diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index b7c336681f..9fbd299ca5 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -50,10 +50,15 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif +} // namespace Impl +} // namespace KokkosLapack // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "magma_v2.h" +namespace KokkosLapack { +namespace Impl { #define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ template <> \ struct gesv_tpl_spec_avail< \ @@ -62,7 +67,9 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -75,9 +82,9 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif } // namespace Impl } // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA // CUSOLVER #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER @@ -106,6 +113,19 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + } // namespace Impl } // namespace KokkosLapack #endif // CUSOLVER diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 82f7aea64a..3356559a84 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -138,23 +138,35 @@ KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) #endif +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +#endif + } // namespace Impl } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include +#include namespace KokkosLapack { namespace Impl { #define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ + template <> \ struct GESV< \ - Kokkos::View, \ + Kokkos::Cuda, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View { \ typedef double SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ @@ -176,8 +188,8 @@ namespace Impl { Kokkos::MemoryTraits> \ PViewType; \ \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ + static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ @@ -209,11 +221,12 @@ namespace Impl { }; #define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ + template <> \ struct GESV< \ - Kokkos::View, \ + Kokkos::Cuda, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View { \ typedef float SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ @@ -235,8 +248,8 @@ namespace Impl { Kokkos::MemoryTraits> \ PViewType; \ \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ + static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ gesv_print_specialization(); \ const bool with_pivot = \ @@ -268,12 +281,13 @@ namespace Impl { }; #define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ + template <> \ + struct GESV**, LAYOUT, \ + Kokkos::Device, \ Kokkos::MemoryTraits>, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits>, \ Kokkos::View { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ @@ -295,8 +309,8 @@ namespace Impl { Kokkos::MemoryTraits> \ PViewType; \ \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ + static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ @@ -329,12 +343,13 @@ namespace Impl { }; #define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ + template <> \ + struct GESV**, LAYOUT, \ + Kokkos::Device, \ Kokkos::MemoryTraits>, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits>, \ Kokkos::View { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ AViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits> \ BViewType; \ typedef Kokkos::View< \ @@ -356,8 +371,8 @@ namespace Impl { Kokkos::MemoryTraits> \ PViewType; \ \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ + static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ diff --git a/lapack/tpls/KokkosLapack_magma.hpp b/lapack/tpls/KokkosLapack_magma.hpp new file mode 100644 index 0000000000..dfde113fa6 --- /dev/null +++ b/lapack/tpls/KokkosLapack_magma.hpp @@ -0,0 +1,39 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_MAGMA_HPP_ +#define KOKKOSLAPACK_MAGMA_HPP_ + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "magma_v2.h" + +namespace KokkosLapack { +namespace Impl { + +// Declaration of the singleton for cusolver +// this is the only header that needs to be +// included when using cusolverDn. +struct MagmaSingleton { + MagmaSingleton(); + + static MagmaSingleton& singleton(); +}; + +} // namespace Impl +} // namespace KokkosLapack +#endif + +#endif // KOKKOSLAPACK_MAGMA_HPP_ diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 3ed0623018..b7e9c6e341 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -18,7 +18,10 @@ #define KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ #include "KokkosLapack_Host_tpl.hpp" // trtri prototype -//#include "KokkosLapack_tpl_spec.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosLapack_magma.hpp" +#endif namespace KokkosLapack { namespace Impl { diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 318f9f06ae..31bc0e708b 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -268,6 +268,13 @@ int test_gesv(const char* mode) { using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ + defined(TEST_THREADS_LAPACK_CPP))) Test::impl_test_gesv( &mode[0], "N", 2); // no padding Test::impl_test_gesv( @@ -279,7 +286,7 @@ int test_gesv(const char* mode) { Test::impl_test_gesv( &mode[0], "N", 1024); // no padding -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { Test::impl_test_gesv( @@ -316,6 +323,13 @@ int test_gesv_mrhs(const char* mode) { using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ + defined(TEST_THREADS_LAPACK_CPP))) Test::impl_test_gesv_mrhs( &mode[0], "N", 2, 5); // no padding Test::impl_test_gesv_mrhs( @@ -328,7 +342,7 @@ int test_gesv_mrhs(const char* mode) { &mode[0], "N", 1024, 5); // no padding // When appropriate run MAGMA specific tests -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { Test::impl_test_gesv_mrhs( From 2b023de73d100746cf35eb50af75923b16df85d2 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 27 Nov 2023 13:06:17 -0700 Subject: [PATCH 099/326] Update blas/unit_test/Test_Blas1_swap.hpp Co-authored-by: brian-kelley --- blas/unit_test/Test_Blas1_swap.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_swap.hpp b/blas/unit_test/Test_Blas1_swap.hpp index 2cf1a4ab8b..624552f1dc 100644 --- a/blas/unit_test/Test_Blas1_swap.hpp +++ b/blas/unit_test/Test_Blas1_swap.hpp @@ -6,7 +6,7 @@ namespace Impl { template void test_swap(int const vector_length) { using execution_space = typename DeviceType::execution_space; - using memory_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; using vector_type = Kokkos::View; using scalar_type = typename vector_type::non_const_value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; From c64c7eb70337e7acf5799025c00f5239421fc524 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 27 Nov 2023 14:00:18 -0700 Subject: [PATCH 100/326] cmake: Add workaround check for CUSOLVER support with Trilinos TPL_ENABLE_CUDA default enables CUBLAS and CUSOLVER in Trilinos, but not CUSPARSE This PR modifies the TPL requirement checks to maintain compatibility with existing configration options of Trilinos Attempt to resolve/workaround issue #2047 --- cmake/kokkoskernels_features.cmake | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/cmake/kokkoskernels_features.cmake b/cmake/kokkoskernels_features.cmake index cbd2a848ef..50acd02ed5 100644 --- a/cmake/kokkoskernels_features.cmake +++ b/cmake/kokkoskernels_features.cmake @@ -40,10 +40,23 @@ ELSEIF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCB MESSAGE(FATAL_ERROR "rocSOLVER requires rocBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCBLAS:BOOL=ON.") ENDIF() -IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS and cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") -ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - MESSAGE(FATAL_ERROR "cuSOLVER requires cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") -ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS) - MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON.") +# TPL_ENABLE_CUDA default enables CUBLAS and CUSOLVER in Trilinos, but not CUSPARSE. CUSPARSE is a required TPL for CUSOLVER support in KokkosKernels. +IF (KOKKOSKERNELS_HAS_TRILINOS AND TPL_ENABLE_CUDA) + IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(WARNING "cuSOLVER requires cuBLAS and cuSPARSE, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON to use.") + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(WARNING "cuSOLVER requires cuSPARSE, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON to use.") + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS) + MESSAGE(WARNING "cuSOLVER requires cuBLAS, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON to use.") + ENDIF() + # Disable CUSOLVER in KokkosKernels if TPL dependency requirements are not met. This is a compatibility workaround to allow existing configuration options for Trilinos to continue working. + SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) +ELSE() + IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS and cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON.") + ENDIF() ENDIF() From 0261159b7d4fb8e38d6cf03318cf62c5eb71dce8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 27 Nov 2023 14:03:55 -0700 Subject: [PATCH 101/326] Addressing Brian Kelley's feedbacks --- perf_test/sparse/KokkosSparse_gs.cpp | 6 +++--- perf_test/sparse/KokkosSparse_pcg.cpp | 2 +- sparse/src/KokkosKernels_Handle.hpp | 8 ++++---- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 9 +++++++-- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 8 ++++---- 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index deb47796c6..163fdb2dd1 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -59,7 +59,7 @@ struct GS_Parameters { // Point: int longRowThreshold = 0; // Cluster: - ClusteringAlgorithm coarse_algo = ClusteringAlgorithm::CLUSTER_DEFAULT; + ClusteringAlgorithm coarse_algo = CLUSTER_DEFAULT; int cluster_size = 10; // Two stage: bool classic = false; @@ -455,9 +455,9 @@ int main(int argc, char** argv) { else if (!strcmp(argv[i], "--coarse-algo")) { const char* algo = getNextArg(i, argc, argv); if (!strcmp(algo, "balloon")) - params.coarse_algo = ClusteringAlgorithm::CLUSTER_BALLOON; + params.coarse_algo = CLUSTER_BALLOON; else if (!strcmp(algo, "mis2")) - params.coarse_algo = ClusteringAlgorithm::CLUSTER_MIS2; + params.coarse_algo = CLUSTER_MIS2; else { std::cout << "Error: invalid coarsening algorithm. Options are balloon " "and mis2.\n"; diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 44519a7ad0..e550eb674a 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -83,7 +83,7 @@ void run_experiment(crsMat_t crsmat, int clusterSize, bool useSequential) { if (clusterSize == 1) kh.create_gs_handle(); else - kh.create_gs_handle(KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON, + kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, clusterSize); Kokkos::Timer timer1; KokkosKernels::Experimental::Example::pcgsolve( diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index d500f19d48..4f33795018 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -747,10 +747,10 @@ class KokkosKernelsHandle { * * @param clusterAlgo Specifies which clustering algorithm to use: * - * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? - * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? + * KokkosSparse::CLUSTER_DEFAULT ?? + * KokkosSparse::CLUSTER_MIS2 ?? + * KokkosSparse::CLUSTER_BALLOON ?? + * KokkosSparse::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: * diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 7c10ea6eb8..3a264d2ef7 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -36,10 +36,15 @@ enum struct ClusteringAlgorithm { NUM_CLUSTERING_ALGORITHMS }; +static constexpr ClusteringAlgorithm CLUSTER_DEFAULT = ClusteringAlgorithm::CLUSTER_DEFAULT; +static constexpr ClusteringAlgorithm CLUSTER_MIS2 = ClusteringAlgorithm::CLUSTER_MIS2; +static constexpr ClusteringAlgorithm CLUSTER_BALLOON = ClusteringAlgorithm::CLUSTER_BALLOON; +static constexpr ClusteringAlgorithm NUM_CLUSTERING_ALGORITHMS = ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; + inline const char *getClusterAlgoName(ClusteringAlgorithm ca) { switch (ca) { - case ClusteringAlgorithm::CLUSTER_BALLOON: return "Balloon"; - case ClusteringAlgorithm::CLUSTER_MIS2: return "MIS(2)"; + case CLUSTER_BALLOON: return "Balloon"; + case CLUSTER_MIS2: return "MIS(2)"; default:; } return "INVALID CLUSTERING ALGORITHM"; diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index c398a1f759..61ae8204d3 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -106,7 +106,7 @@ void run_gauss_seidel( int cluster_size = 1, bool classic = false, // only with two-stage, true for sptrsv instead of richardson - ClusteringAlgorithm clusterAlgo = ClusteringAlgorithm::CLUSTER_DEFAULT, + ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT, KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT) { using size_type = typename crsMat_t::size_type; @@ -247,7 +247,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; std::vector clusteringAlgos = { - ClusteringAlgorithm::CLUSTER_MIS2, ClusteringAlgorithm::CLUSTER_BALLOON}; + CLUSTER_MIS2, CLUSTER_BALLOON}; for (int csize = 0; csize < 3; csize++) { for (auto clusterAlgo : clusteringAlgos) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { @@ -351,7 +351,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, int clusterSizes[3] = {2, 5, 34}; for (int csize = 0; csize < 3; csize++) { for (int algo = 0; - algo < (int)ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; algo++) { + algo < (int)NUM_CLUSTERING_ALGORITHMS; algo++) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; // Zero out X before solving @@ -554,7 +554,7 @@ void test_gauss_seidel_empty() { for (const int rowmapLen : {0, 1, 5}) { KernelHandle kh; if (doingCluster) - kh.create_gs_handle(ClusteringAlgorithm::CLUSTER_DEFAULT, 10); + kh.create_gs_handle(CLUSTER_DEFAULT, 10); else kh.create_gs_handle(GS_DEFAULT); const auto nRows = KOKKOSKERNELS_MACRO_MAX(0, rowmapLen - 1); From 16e327e82f0bf96f215eb274d6a1facdf9a5cf44 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 27 Nov 2023 14:07:21 -0700 Subject: [PATCH 102/326] Formatting --- perf_test/sparse/KokkosSparse_pcg.cpp | 3 +-- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 12 ++++++++---- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 7 +++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index e550eb674a..9825f7c90d 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -83,8 +83,7 @@ void run_experiment(crsMat_t crsmat, int clusterSize, bool useSequential) { if (clusterSize == 1) kh.create_gs_handle(); else - kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, - clusterSize); + kh.create_gs_handle(KokkosSparse::CLUSTER_BALLOON, clusterSize); Kokkos::Timer timer1; KokkosKernels::Experimental::Example::pcgsolve( kh, crsmat, kok_b_vector, kok_x_vector, cg_iteration_limit, diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 3a264d2ef7..624382ec5b 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -36,10 +36,14 @@ enum struct ClusteringAlgorithm { NUM_CLUSTERING_ALGORITHMS }; -static constexpr ClusteringAlgorithm CLUSTER_DEFAULT = ClusteringAlgorithm::CLUSTER_DEFAULT; -static constexpr ClusteringAlgorithm CLUSTER_MIS2 = ClusteringAlgorithm::CLUSTER_MIS2; -static constexpr ClusteringAlgorithm CLUSTER_BALLOON = ClusteringAlgorithm::CLUSTER_BALLOON; -static constexpr ClusteringAlgorithm NUM_CLUSTERING_ALGORITHMS = ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; +static constexpr ClusteringAlgorithm CLUSTER_DEFAULT = + ClusteringAlgorithm::CLUSTER_DEFAULT; +static constexpr ClusteringAlgorithm CLUSTER_MIS2 = + ClusteringAlgorithm::CLUSTER_MIS2; +static constexpr ClusteringAlgorithm CLUSTER_BALLOON = + ClusteringAlgorithm::CLUSTER_BALLOON; +static constexpr ClusteringAlgorithm NUM_CLUSTERING_ALGORITHMS = + ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; inline const char *getClusterAlgoName(ClusteringAlgorithm ca) { switch (ca) { diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 61ae8204d3..89370f0dc5 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -246,8 +246,8 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, } //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; - std::vector clusteringAlgos = { - CLUSTER_MIS2, CLUSTER_BALLOON}; + std::vector clusteringAlgos = {CLUSTER_MIS2, + CLUSTER_BALLOON}; for (int csize = 0; csize < 3; csize++) { for (auto clusterAlgo : clusteringAlgos) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { @@ -350,8 +350,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, //*** Cluster-coloring version **** int clusterSizes[3] = {2, 5, 34}; for (int csize = 0; csize < 3; csize++) { - for (int algo = 0; - algo < (int)NUM_CLUSTERING_ALGORITHMS; algo++) { + for (int algo = 0; algo < (int)NUM_CLUSTERING_ALGORITHMS; algo++) { for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; // Zero out X before solving From 89d149e8d18f4369a943c0d26eed03dba06cee53 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 27 Nov 2023 14:09:36 -0700 Subject: [PATCH 103/326] Removing 'ClusteringAlgorithm::' --- sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 6cf8df991c..501e71e3e7 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -555,23 +555,22 @@ class ClusterGaussSeidel { } nnz_view_t vertClusters; auto clusterAlgo = gsHandle->get_clustering_algo(); - if (clusterAlgo == ClusteringAlgorithm::CLUSTER_DEFAULT) - clusterAlgo = ClusteringAlgorithm::CLUSTER_MIS2; + if (clusterAlgo == CLUSTER_DEFAULT) clusterAlgo = CLUSTER_MIS2; switch (clusterAlgo) { - case ClusteringAlgorithm::CLUSTER_MIS2: { + case CLUSTER_MIS2: { vertClusters = KokkosGraph::graph_mis2_aggregate( raw_sym_xadj, raw_sym_adj, numClusters); break; } - case ClusteringAlgorithm::CLUSTER_BALLOON: { + case CLUSTER_BALLOON: { BalloonClustering balloon( num_rows, raw_sym_xadj, raw_sym_adj); vertClusters = balloon.run(clusterSize); break; } - case ClusteringAlgorithm::CLUSTER_DEFAULT: { + case CLUSTER_DEFAULT: { throw std::logic_error( "Logic to choose default clustering algorithm is incorrect"); } From 0599b37e4da8505f05d28d7dc27154969ce7a932 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Nov 2023 20:22:25 -0700 Subject: [PATCH 104/326] Lapack: gesv, incorporate Brian's feedback --- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 359 ++++++------------ 1 file changed, 111 insertions(+), 248 deletions(-) diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 3356559a84..36bdcaedcb 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -160,261 +160,124 @@ KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GESV< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - BViewType; \ - typedef Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - PViewType; \ - \ - static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ - \ - if (with_pivot) { \ - magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), \ - LDA, IPIV.data(), \ - reinterpret_cast(B.data()), LDB, \ - &info); \ - } else { \ - magma_dgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +template +void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, + const BViewType& B, const IPIVViewType& IPIV) { + using scalar_type = typename AViewType::non_const_value_type; + + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + Kokkos::ArithTraits::name() + "]"); + gesv_print_specialization(); + + const bool with_pivot = + !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); + + magma_int_t N = static_cast(A.extent(1)); + magma_int_t AST = static_cast(A.stride(1)); + magma_int_t LDA = (AST == 0) ? 1 : AST; + magma_int_t BST = static_cast(B.stride(1)); + magma_int_t LDB = (BST == 0) ? 1 : BST; + magma_int_t NRHS = static_cast(B.extent(1)); + + KokkosLapack::Impl::MagmaSingleton& s = + KokkosLapack::Impl::MagmaSingleton::singleton(); + magma_int_t info = 0; + + space.fence(); + if constexpr (std::is_same_v) { + if (with_pivot) { + magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), + LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, + &info); + } else { + magma_sgesv_nopiv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); + } + } -#define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GESV< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - BViewType; \ - typedef Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - PViewType; \ - \ - static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ - \ - if (with_pivot) { \ - magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), \ - LDA, IPIV.data(), \ - reinterpret_cast(B.data()), LDB, \ - &info); \ - } else { \ - magma_sgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; + if constexpr (std::is_same_v) { + if (with_pivot) { + magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), + LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, + &info); + } else { + magma_dgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); + } + } -#define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - BViewType; \ - typedef Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - PViewType; \ - \ - static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ - \ - if (with_pivot) { \ - magma_zgesv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - IPIV.data(), reinterpret_cast(B.data()), \ - LDB, &info); \ - } else { \ - magma_zgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; + if constexpr (std::is_same_v>) { + if (with_pivot) { + magma_cgesv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), + LDB, &info); + } else { + magma_cgesv_nopiv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); + } + } -#define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - BViewType; \ - typedef Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - PViewType; \ - \ - static void gesv(const Kokkos::Cuda& /*space*/, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ + if constexpr (std::is_same_v>) { + if (with_pivot) { + magma_zgesv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), + LDB, &info); + } else { + magma_zgesv_nopiv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); + } + } + ExecSpace().fence(); + Kokkos::Profiling::popRegion(); +} + +#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ \ - if (with_pivot) { \ - magma_cgesv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - IPIV.data(), reinterpret_cast(B.data()), \ - LDB, &info); \ - } else { \ - magma_cgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + magmaGesvWrapper(space, A, B, IPIV); \ } \ }; -KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) } // namespace Impl } // namespace KokkosLapack From e04272e4aec9ae3c0766c23490f21be93f4c805b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Nov 2023 20:32:11 -0700 Subject: [PATCH 105/326] Applying clang-format --- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 120 +++++++++--------- 1 file changed, 63 insertions(+), 57 deletions(-) diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 36bdcaedcb..41592e079a 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -162,14 +162,14 @@ namespace Impl { template void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, - const BViewType& B, const IPIVViewType& IPIV) { + const BViewType& B, const IPIVViewType& IPIV) { using scalar_type = typename AViewType::non_const_value_type; - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + Kokkos::ArithTraits::name() + "]"); + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + + Kokkos::ArithTraits::name() + "]"); gesv_print_specialization(); - const bool with_pivot = - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); + const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); magma_int_t N = static_cast(A.extent(1)); magma_int_t AST = static_cast(A.stride(1)); @@ -179,32 +179,31 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, magma_int_t NRHS = static_cast(B.extent(1)); KokkosLapack::Impl::MagmaSingleton& s = - KokkosLapack::Impl::MagmaSingleton::singleton(); + KokkosLapack::Impl::MagmaSingleton::singleton(); magma_int_t info = 0; space.fence(); if constexpr (std::is_same_v) { if (with_pivot) { - magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), - LDA, IPIV.data(), - reinterpret_cast(B.data()), LDB, - &info); + magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), + LDB, &info); } else { - magma_sgesv_nopiv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_sgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), + LDA, reinterpret_cast(B.data()), + LDB, &info); } } if constexpr (std::is_same_v) { if (with_pivot) { - magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), - LDA, IPIV.data(), - reinterpret_cast(B.data()), LDB, - &info); + magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), + LDB, &info); } else { - magma_dgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_dgesv_nopiv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } @@ -212,8 +211,8 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, if (with_pivot) { magma_cgesv_gpu( N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), - LDB, &info); + IPIV.data(), reinterpret_cast(B.data()), LDB, + &info); } else { magma_cgesv_nopiv_gpu( N, NRHS, reinterpret_cast(A.data()), LDA, @@ -225,8 +224,8 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, if (with_pivot) { magma_zgesv_gpu( N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), - LDB, &info); + IPIV.data(), reinterpret_cast(B.data()), LDB, + &info); } else { magma_zgesv_nopiv_gpu( N, NRHS, reinterpret_cast(A.data()), LDA, @@ -237,47 +236,54 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, Kokkos::Profiling::popRegion(); } -#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ +#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View< \ + magma_int_t*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ \ - static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - magmaGesvWrapper(space, A, B, IPIV); \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + magmaGesvWrapper(space, A, B, IPIV); \ } \ }; KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) } // namespace Impl } // namespace KokkosLapack From f61df4737c94220b22afa36ba5f308cd84f1dad4 Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Tue, 28 Nov 2023 10:43:26 -0600 Subject: [PATCH 106/326] Fixing some deprecation warnings/errors for ROCm 6 --- sparse/src/KokkosSparse_Utils_rocsparse.hpp | 4 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 40 ++++++++++++++++++- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 12 ++++-- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index cc34e55093..b9199af8dd 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -21,8 +21,8 @@ #include #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE -#include -#include "rocsparse/rocsparse.h" +#include +#include namespace KokkosSparse { namespace Impl { diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 97019e4682..75752190e7 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -869,8 +869,46 @@ void spmv_block_impl_rocsparse( rocsparse_mat_info info; KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info)); + // *_ex* functions deprecated in introduced in 6+ +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } // *_ex* functions introduced in 5.4.0 -#if KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 +#elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 if constexpr (std::is_same_v) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index e12ee23937..6d8513f83c 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -359,8 +359,6 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, // rocSPARSE #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) -#include -#include #include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { @@ -443,7 +441,15 @@ void spmv_rocsparse(const Kokkos::HIP& exec, alg = rocsparse_spmv_alg_csr_stream; } -#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_buffer_size, &buffer_size, tmp_buffer)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_compute, &buffer_size, tmp_buffer)); +#elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); From 4d1cfe213db9e7d343a6457221e447d9ca107b4d Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 29 Nov 2023 09:34:11 -0700 Subject: [PATCH 107/326] BLAS: fix bug in TPL layer of KokkosBlas::swap The cuBLAS Kokkos::complex specialization had a small bug where the rank of the view was not specified correctly! --- blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp index 49ae14ad9d..555c942c12 100644 --- a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp @@ -293,10 +293,10 @@ namespace Impl { Kokkos::Device, \ Kokkos::MemoryTraits>, \ true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View, LAYOUT, \ + using XVector = Kokkos::View*, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View, LAYOUT, \ + using YVector = Kokkos::View*, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits>; \ static void swap(EXECSPACE const& space, XVector const& X, \ From 2ecf6755959216dbfca4eb0d15ac6070c69586f6 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Wed, 27 Sep 2023 17:02:25 -0500 Subject: [PATCH 108/326] CMake: fix bugs in deciding KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX --- CheckHostBlasReturnComplex.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index b9528ce45a..657a9f2286 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -21,8 +21,8 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) extern \"C\" { void F77_BLAS_MANGLE(zdotc,ZDOTC)( - std::complex* result, const int* n, - const std::complex x[], const int* incx, + std::complex* result, const int* n, + const std::complex x[], const int* incx, const std::complex y[], const int* incy); } @@ -49,9 +49,9 @@ int main() { CHECK_CXX_SOURCE_RUNS("${SOURCE}" KK_BLAS_RESULT_AS_POINTER_ARG) IF(${KK_BLAS_RESULT_AS_POINTER_ARG}) - SET(VARNAME OFF) + SET(${VARNAME} OFF PARENT_SCOPE) ELSE() - SET(VARNAME ON) + SET(${VARNAME} ON PARENT_SCOPE) ENDIF() ENDFUNCTION() From eebdbb2476a10de3864ba13665d4b9ea439058d8 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Tue, 22 Aug 2023 16:45:14 -0500 Subject: [PATCH 109/326] TPL: revise BLAS1 dot implementation --- blas/src/KokkosBlas1_dot.hpp | 44 +- blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 34 +- blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 436 ++++++++----------- 3 files changed, 223 insertions(+), 291 deletions(-) diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index ebccce7d7c..aa995836eb 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -96,25 +96,37 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { Kokkos::View>; - result_type result{}; - RVector_Result R = RVector_Result(&result); XVector_Internal X = x; YVector_Internal Y = y; - // Even though RVector is the template parameter, Dot::dot has an overload - // that accepts RVector_Internal (with the special accumulator, if dot_type is - // 32-bit precision). Impl::Dot needs to support both cases, and it's easier - // to do this with overloading than by extending the ETI to deal with two - // different scalar types. - Impl::DotSpecialAccumulator::dot(space, R, - X, Y); - space.fence(); - // mfh 22 Jan 2020: We need the line below because - // Kokkos::complex lacks a constructor that takes a - // Kokkos::complex with U != T. - return Kokkos::Details::CastPossiblyComplex::cast( - result); + bool useFallback = false; + if (useFallback) { + // Even though RVector is the template parameter, Dot::dot has an overload + // that accepts RVector_Internal (with the special accumulator, if dot_type + // is 32-bit precision). Impl::Dot needs to support both cases, and it's + // easier to do this with overloading than by extending the ETI to deal with + // two different scalar types. + result_type result{}; + RVector_Result R = RVector_Result(&result); + Impl::DotSpecialAccumulator::dot(space, + R, X, + Y); + space.fence(); + // mfh 22 Jan 2020: We need the line below because + // Kokkos::complex lacks a constructor that takes a + // Kokkos::complex with U != T. + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } else { + dot_type result{}; + RVector_Internal R = RVector_Internal(&result); + Impl::Dot::dot(space, R, X, Y); + space.fence(); + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } } /// \brief Return the dot product of the two vectors x and y. diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index ca2139980d..51f6e5965c 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -59,11 +59,7 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ template <> \ struct dot_tpl_spec_avail< \ EXECSPACE, \ @@ -77,19 +73,27 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace) +#endif } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 718e32f14c..ace26ebdbd 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -39,71 +39,40 @@ inline void dot_print_specialization() { namespace KokkosBlas { namespace Impl { - -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Dot< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ dot_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ + R() = HostBlas::dot( \ + N, reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -112,105 +81,22 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::dot( \ - N, reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::dot( \ - N, reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) } // namespace Impl } // namespace KokkosBlas - #endif // cuBLAS @@ -219,38 +105,48 @@ KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { - -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ + const int N = static_cast(numElems); \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -259,81 +155,73 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ + Kokkos::Cuda, Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) +} // namespace Impl +} // namespace KokkosBlas +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ +namespace KokkosBlas { +namespace Impl { +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZdotc(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -342,72 +230,100 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_sdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_ddot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_cdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_zdotc, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) +} // namespace Impl +} // namespace KokkosBlas +#endif + +// ONEMKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCdotc(s.handle, N, reinterpret_cast(X.data()), \ - one, reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const std::int64_t N = static_cast(numElems); \ + TPL_DOT(exec.sycl_queue(), N, \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R())); \ } else { \ - Dot::dot(space, R, \ + Dot::dot(exec, R, \ X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(false) } // namespace Impl } // namespace KokkosBlas - #endif #endif From f5415f8a792100b1cca985b8bab335c42ab810fc Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Thu, 28 Sep 2023 17:04:29 -0500 Subject: [PATCH 110/326] Fix compile errors for C-linkage dot functions returning std::complex --- blas/tpls/KokkosBlas_Host_tpl.cpp | 40 ++++++++++++++++++------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 71e22a690c..98da43e1d8 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -89,22 +89,30 @@ double F77_BLAS_MANGLE(ddot, DDOT)(const int* N, const double* x, const int* x_inc, const double* y, const int* y_inc); #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) -std::complex F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -std::complex F77_BLAS_MANGLE(zdotu, ZDOTU)( - const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); -std::complex F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -std::complex F77_BLAS_MANGLE(zdotc, ZDOTC)( - const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); +// clang-format off +// use C complex types as return types instead of std::complex, otherwise compiler will complain +// error: 'cdotu_' has C-linkage specified, but returns user-defined type 'std::complex' which is incompatible with C [-Werror,-Wreturn-type-c-linkage]" +// clang-format on +float _Complex F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +double _Complex F77_BLAS_MANGLE(zdotu, ZDOTU)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +float _Complex F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +double _Complex F77_BLAS_MANGLE(zdotc, ZDOTC)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); #else void F77_BLAS_MANGLE(cdotu, CDOTU)(std::complex* res, const int* N, From 3cd64206e79d1cc372cc4b53cfa4f1048131fa00 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Thu, 5 Oct 2023 16:35:48 -0500 Subject: [PATCH 111/326] Use a C struct for complex numbers to avoid error: '_Complex' is a C99 extension [-Werror,-Wc99-extensions]. --- blas/tpls/KokkosBlas_Host_tpl.cpp | 57 ++++++++++++++++++------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 98da43e1d8..ec739aa98a 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -90,29 +90,38 @@ double F77_BLAS_MANGLE(ddot, DDOT)(const int* N, const double* x, const int* y_inc); #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) // clang-format off -// use C complex types as return types instead of std::complex, otherwise compiler will complain +// For the return type, don't use std::complex, otherwise compiler will complain // error: 'cdotu_' has C-linkage specified, but returns user-defined type 'std::complex' which is incompatible with C [-Werror,-Wreturn-type-c-linkage]" +// But with float _Complex, I got error: '_Complex' is a C99 extension [-Werror,-Wc99-extensions]. +// So I just use a C struct. // clang-format on -float _Complex F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -double _Complex F77_BLAS_MANGLE(zdotu, ZDOTU)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -float _Complex F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -double _Complex F77_BLAS_MANGLE(zdotc, ZDOTC)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); +typedef struct { + float vals[2]; +} _kk_float2; +typedef struct { + double vals[2]; +} _kk_double2; + +_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const int* N, + const std::complex* x, + const int* x_inc, + const std::complex* y, + const int* y_inc); #else void F77_BLAS_MANGLE(cdotu, CDOTU)(std::complex* res, const int* N, @@ -803,7 +812,8 @@ std::complex HostBlas >::dot( int n, const std::complex* x, int x_inc, const std::complex* y, int y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) - return F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); + _kk_float2 res = F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); + return std::complex(res.vals[0], res.vals[1]); #else std::complex res; F77_FUNC_CDOTC(&res, &n, x, &x_inc, y, &y_inc); @@ -975,7 +985,8 @@ std::complex HostBlas >::dot( int n, const std::complex* x, int x_inc, const std::complex* y, int y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) - return F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); + _kk_double2 res = F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); + return std::complex(res.vals[0], res.vals[1]); #else std::complex res; F77_FUNC_ZDOTC(&res, &n, x, &x_inc, y, &y_inc); From d2e752485ed8164072c189df92bb84bf437ce34b Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Mon, 6 Nov 2023 11:24:58 -0600 Subject: [PATCH 112/326] Add a workaround by disabling host MKL dot with complex numbers --- blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 51f6e5965c..3ba8f063b4 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -52,10 +52,18 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) + +// TODO: we met difficuties in FindTPLMKL.cmake to set the BLAS library properly +// such that the test in CheckHostBlasReturnComplex.cmake could not be +// compiled and run to give a correct answer on KK_BLAS_RESULT_AS_POINTER_ARG. +// This resulted in segfault in dot() with MKL and complex. +// So we just temporarily disable it until FindTPLMKL.cmake is fixed. +#if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif #endif From 745a7b235b9756a8c9ec6d89c28e1337df409983 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 1 Dec 2023 15:11:16 -0700 Subject: [PATCH 113/326] Allow KokkosKernels_ENABLE_PERFTESTS=ON to build perf_tests without KokkosKernels_ENABLE_TESTS=ON --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb5d0591d6..40a4595745 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -427,7 +427,7 @@ ELSE() IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) IF (KokkosKernels_ENABLE_PERFTESTS) MESSAGE(STATUS "Enabling perf tests.") - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) + add_subdirectory(perf_test) # doesn't require KokkosKernels_ENABLE_TESTS=ON ENDIF () IF (KokkosKernels_ENABLE_EXAMPLES) MESSAGE(STATUS "Enabling examples.") From 27082a96eb60469d88a028ce759ff88832ed5328 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 1 Dec 2023 15:28:27 -0700 Subject: [PATCH 114/326] format sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp --- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 6d8513f83c..99799ced5d 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -442,13 +442,15 @@ void spmv_rocsparse(const Kokkos::HIP& exec, } #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( - handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, - compute_type, alg, rocsparse_spmv_stage_buffer_size, &buffer_size, tmp_buffer)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, + vecY, compute_type, alg, rocsparse_spmv_stage_buffer_size, + &buffer_size, tmp_buffer)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( - handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, - compute_type, alg, rocsparse_spmv_stage_compute, &buffer_size, tmp_buffer)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, + vecY, compute_type, alg, rocsparse_spmv_stage_compute, + &buffer_size, tmp_buffer)); #elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, From 1bbf54962760c56658cf3ce85d1b4f7674a3b3e5 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Sat, 2 Dec 2023 15:08:30 -0700 Subject: [PATCH 115/326] cmake: fix tpl check so cusolver can be disabled when needed --- cmake/kokkoskernels_features.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/kokkoskernels_features.cmake b/cmake/kokkoskernels_features.cmake index 50acd02ed5..211c0c740e 100644 --- a/cmake/kokkoskernels_features.cmake +++ b/cmake/kokkoskernels_features.cmake @@ -42,15 +42,17 @@ ENDIF() # TPL_ENABLE_CUDA default enables CUBLAS and CUSOLVER in Trilinos, but not CUSPARSE. CUSPARSE is a required TPL for CUSOLVER support in KokkosKernels. IF (KOKKOSKERNELS_HAS_TRILINOS AND TPL_ENABLE_CUDA) + # Checks disable CUSOLVER in KokkosKernels if TPL dependency requirements are not met. This is a compatibility workaround to allow existing configuration options for Trilinos to continue working. IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) MESSAGE(WARNING "cuSOLVER requires cuBLAS and cuSPARSE, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON to use.") + SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) MESSAGE(WARNING "cuSOLVER requires cuSPARSE, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON to use.") + SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS) MESSAGE(WARNING "cuSOLVER requires cuBLAS, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON to use.") + SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) ENDIF() - # Disable CUSOLVER in KokkosKernels if TPL dependency requirements are not met. This is a compatibility workaround to allow existing configuration options for Trilinos to continue working. - SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) ELSE() IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS and cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") From c8b29991043ae1a7a3b817a8a867722c2a4543c6 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 4 Dec 2023 15:36:42 -0700 Subject: [PATCH 116/326] Link std::filesystem for IntelLLVM in perf_test/sparse --- perf_test/sparse/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 8a994b4122..ef0bf7d995 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -145,4 +145,9 @@ if (KokkosKernels_ENABLE_BENCHMARK) if (Kokkos_CXX_COMPILER_ID STREQUAL HIPCC AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 5.3) target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs) endif() + # IntelLLVM < 2023.1.0 (and possible higher versions too) have an underlying clang that has the std::filesystem + # in an experimental namespace and a different library + if (Kokkos_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 2023.1.0) + target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs) + endif() endif() From a52ba02ad66a04869c8757f1676ac031d467ea88 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 5 Dec 2023 10:11:23 -0700 Subject: [PATCH 117/326] gemm3 perf test: user CUDA, SYCL, or HIP device for kokkos:initialize --- ...osBlas3_gemm_standalone_perf_test_benchmark.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index d617ffcdf3..cd7f194071 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -142,7 +142,19 @@ void run(const blas3_gemm_params& params) { int main(int argc, char** argv) { const auto params = blas3_gemm_params::get_params(argc, argv); const int num_threads = params.use_openmp; - const int device_id = params.use_cuda - 1; + + // the common parameter parser takes the requested device ID and + // adds 1 to it (e.g. --cuda 0 -> params.use_cuda = 1) + // this is presumably so that 0 can be a sentinel value, + // even though device ID 0 is valid + // here, we use CUDA, SYCL, or HIP, whichever is set first, to + // choose which device Kokkos should initialize on + // or -1, for no such selection + const int device_id = + params.use_cuda + ? params.use_cuda - 1 + : (params.use_sycl ? params.use_sycl - 1 + : (params.use_hip ? params.use_hip - 1 : -1)); Kokkos::initialize(Kokkos::InitializationSettings() .set_num_threads(num_threads) From d1bf49932d79d38acb3e8e31f34d6665fe3c9392 Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Tue, 5 Dec 2023 19:12:23 -0600 Subject: [PATCH 118/326] Fix for rocm_verison header inclusion --- sparse/src/KokkosSparse_Utils_rocsparse.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index b9199af8dd..baf2d3a822 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -21,7 +21,11 @@ #include #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +#if __has_include() #include +#else +#include +#endif #include namespace KokkosSparse { From a91a1f24560692d6a294e764f82f497dc10aeb7d Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 8 Dec 2023 14:22:36 -0700 Subject: [PATCH 119/326] fence Kokkos before timed interations --- perf_test/sparse/KokkosSparse_spmv_benchmark.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index aeaa37db96..4ae0a34168 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -132,6 +132,7 @@ void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x, rand_pool, 10); Kokkos::fill_random(y, rand_pool, 10); + Kokkos::fence(); // Run the actual experiments for (auto _ : state) { From 71442841a80513c41ee5291afdea05611a5c7d25 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 13 Dec 2023 11:26:17 -0700 Subject: [PATCH 120/326] Deprecate KOKKOSLINALG_OPT_LEVEL --- BUILD.md | 2 +- CMakeLists.txt | 2 +- common/src/KokkosLinAlg_config.h | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/BUILD.md b/BUILD.md index 5be269bd7c..6fcea4dd33 100644 --- a/BUILD.md +++ b/BUILD.md @@ -227,7 +227,7 @@ endif() * KokkosKernels_LAPACK_ROOT: PATH * Location of LAPACK install root. * Default: None or the value of the environment variable LAPACK_ROOT if set -* KokkosKernels_LINALG_OPT_LEVEL: BOOL +* KokkosKernels_LINALG_OPT_LEVEL: BOOL **DEPRECATED** * Optimization level for KokkosKernels computational kernels: a nonnegative integer. Higher levels result in better performance that is more uniform for corner cases, but increase build time and library size. The default value is 1, which should give performance within ten percent of optimal on most platforms, for most problems. * Default: 1 * KokkosKernels_MAGMA_ROOT: PATH diff --git a/CMakeLists.txt b/CMakeLists.txt index 40a4595745..76b9b9039d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,7 +156,7 @@ ELSE() KOKKOSKERNELS_ADD_OPTION_AND_DEFINE( LINALG_OPT_LEVEL KOKKOSLINALG_OPT_LEVEL - "Optimization level for KokkosKernels computational kernels: a nonnegative integer. Higher levels result in better performance that is more uniform for corner cases, but increase build time and library size. The default value is 1, which should give performance within ten percent of optimal on most platforms, for most problems. Default: 1" + "DEPRECATED. Optimization level for KokkosKernels computational kernels: a nonnegative integer. Higher levels result in better performance that is more uniform for corner cases, but increase build time and library size. The default value is 1, which should give performance within ten percent of optimal on most platforms, for most problems. Default: 1" "1") # Enable experimental features of KokkosKernels if set at configure diff --git a/common/src/KokkosLinAlg_config.h b/common/src/KokkosLinAlg_config.h index fccfe799ca..fe97c1de8b 100644 --- a/common/src/KokkosLinAlg_config.h +++ b/common/src/KokkosLinAlg_config.h @@ -18,6 +18,8 @@ #ifndef KOKKOSLINALG_CONFIG_H #define KOKKOSLINALG_CONFIG_H +[[deprecated("KokkosLinAlg_config.h is deprecated!")]] + #include #endif // KOKKOSLINALG_CONFIG_H From 543446d289a351b60923845bd262979ab9a96294 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 13 Dec 2023 11:39:22 -0700 Subject: [PATCH 121/326] Add CMake warning message if KokkosKernels_LINALG_OPT_LEVEL is used --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 76b9b9039d..ab92213fe4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,6 +159,13 @@ ELSE() "DEPRECATED. Optimization level for KokkosKernels computational kernels: a nonnegative integer. Higher levels result in better performance that is more uniform for corner cases, but increase build time and library size. The default value is 1, which should give performance within ten percent of optimal on most platforms, for most problems. Default: 1" "1") + if (KokkosKernels_LINALG_OPT_LEVEL AND NOT KokkosKernels_LINALG_OPT_LEVEL STREQUAL "1") + message(WARNING "KokkosKernels_LINALG_OPT_LEVEL is deprecated!") + endif() + if(KokkosKernels_KOKKOSLINALG_OPT_LEVEL AND NOT KokkosKernels_KOKKOSLINALG_OPT_LEVEL STREQUAL "1") + message(WARNING "KokkosKernels_KOKKOSLINALG_OPT_LEVEL is deprecated!") + endif() + # Enable experimental features of KokkosKernels if set at configure # time. Default is no. KOKKOSKERNELS_ADD_OPTION_AND_DEFINE( From 272807196b3c14f602bdc08f3acd076b1af422f0 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 14 Dec 2023 08:54:14 -0700 Subject: [PATCH 122/326] Async matrix release for MKL >= 2023.2 --- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index e12ee23937..abda57d0c4 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -733,9 +733,16 @@ struct spmv_onemkl_wrapper { auto ev_gemv = oneapi::mkl::sparse::gemv(exec.sycl_queue(), mkl_mode, alpha, handle, x.data(), beta, y.data(), {ev_opt}); + // MKL 2023.2 and up make this release okay async even though it takes a + // pointer to a stack variable +#if INTEL_MKL_VERSION >= 20230200 + oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &handle, + {ev_gemv}); +#else auto ev_release = oneapi::mkl::sparse::release_matrix_handle( exec.sycl_queue(), &handle, {ev_gemv}); ev_release.wait(); +#endif } }; @@ -768,9 +775,16 @@ struct spmv_onemkl_wrapper { reinterpret_cast*>( const_cast(x.data())), beta, reinterpret_cast*>(y.data()), {ev_opt}); + // MKL 2023.2 and up make this release okay async even though it takes a + // pointer to a stack variable +#if INTEL_MKL_VERSION >= 20230200 + oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &handle, + {ev_gemv}); +#else auto ev_release = oneapi::mkl::sparse::release_matrix_handle( exec.sycl_queue(), &handle, {ev_gemv}); ev_release.wait(); +#endif } }; From ddf425f8d136aebca2d4e5179a945c69eaf8b6e5 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 14 Dec 2023 08:46:36 -0800 Subject: [PATCH 123/326] Support CUBLAS_{LIBRARIES,LIBRARY_DIRS,INCLUDE_DIRS,ROOT} and KokkosKernels_CUBLAS_ROOT --- cmake/Modules/FindTPLCUBLAS.cmake | 57 ++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/cmake/Modules/FindTPLCUBLAS.cmake b/cmake/Modules/FindTPLCUBLAS.cmake index 890c2dac62..feb39d0373 100644 --- a/cmake/Modules/FindTPLCUBLAS.cmake +++ b/cmake/Modules/FindTPLCUBLAS.cmake @@ -1,18 +1,43 @@ -FIND_PACKAGE(CUDA) - -INCLUDE(FindPackageHandleStandardArgs) -IF (NOT CUDA_FOUND) - #Important note here: this find Module is named TPLCUBLAS - #The eventual target is named CUBLAS. To avoid naming conflicts - #the find module is called TPLCUBLAS. This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_FOUND) -ELSE() - #The libraries might be empty - OR they might explicitly be not found - IF("${CUDA_CUBLAS_LIBRARIES}" MATCHES "NOTFOUND") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_CUBLAS_LIBRARIES) +if(CUBLAS_LIBRARIES AND CUBLAS_LIBRARY_DIRS AND CUBLAS_INCLUDE_DIRS) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES ${CUBLAS_LIBRARIES} + LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + HEADER_PATHS ${CUBLAS_INCLUDE_DIRS} + ) +elseif(CUBLAS_LIBRARIES AND CUBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES ${CUBLAS_LIBRARIES} + LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + ) +elseif(CUBLAS_LIBRARIES) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES ${CUBLAS_LIBRARIES} + ) +elseif(CUBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES cublas + LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + ) +elseif(CUBLAS_ROOT OR KokkosKernels_CUBLAS_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES cublas + ) +else() # backwards-compatible way + FIND_PACKAGE(CUDA) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT CUDA_FOUND) + #Important note here: this find Module is named TPLCUBLAS + #The eventual target is named CUBLAS. To avoid naming conflicts + #the find module is called TPLCUBLAS. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_FOUND) ELSE() - KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUBLAS INTERFACE - LINK_LIBRARIES "${CUDA_CUBLAS_LIBRARIES}") + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_CUBLAS_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_CUBLAS_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUBLAS INTERFACE + LINK_LIBRARIES "${CUDA_CUBLAS_LIBRARIES}") + ENDIF() ENDIF() -ENDIF() +endif() From d53066a38a6cc779636eca5e43339547e36ceb7d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 15 Dec 2023 14:42:12 -0700 Subject: [PATCH 124/326] KokkosSparse_spmv_impl_merge.hpp: use capture by reference Resolve warnings in builds with c++20 support enabled: "kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl_merge.hpp:166:81: warning: implicit capture of 'this' via '[=]' is deprecated in C++20 [-Wdeprecated]" --- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index 9329b8a097..c49519cc3a 100644 --- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -165,7 +165,7 @@ struct SpmvMergeHierarchical { pathLengthTeamChunk * sizeof(A_ordinal_type)); Kokkos::parallel_for( Kokkos::TeamThreadRange(thread, teamNnzBegin, teamNnzEnd), - [=](const A_ordinal_type& i) { + [&](const A_ordinal_type& i) { valuesS[i - teamNnzBegin] = A.values(i); entriesS[i - teamNnzBegin] = A.graph.entries(i); }); From c61f7087cac9863ec082d0588c2c6b9b76702303 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 15 Dec 2023 14:44:34 -0700 Subject: [PATCH 125/326] KokkosSparse_par_ilut_numeric_impl.hpp: use capture by reference Resolve warnings in builds with c++20 support enabled: "kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp(591): warning #2908-D: the implicit by-copy capture of "this" is deprecated" --- sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 0ac9c26166..9375039747 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -588,7 +588,7 @@ struct IlutWrap { count); Kokkos::single(Kokkos::PerTeam(team), - [=]() { O_row_map(row_idx) = count; }); + [&]() { O_row_map(row_idx) = count; }); } float_t threshold; From 49f5a61b033401cdb2802f073753df9ba21780ce Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 15 Dec 2023 22:37:22 -0700 Subject: [PATCH 126/326] Backup --- blas/unit_test/Test_Blas2_ger.hpp | 117 ++++++++++++++++-------------- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 7d30a4b65d..deaffc2bbd 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -79,10 +79,11 @@ class GerTester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected, _ViewTypeX& x, - _ViewTypeY& y, _ViewTypeA& A, + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>/*_HostViewTypeX& h_x*/& x, + view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, + /*_HostViewTypeY& h_y, _HostViewTypeA& h_A,*/ + _ViewTypeExpected& h_expected, /*_ViewTypeX& x,*/ + /*_ViewTypeY& y, _ViewTypeA& A,*/ bool& expectedResultIsKnown); template @@ -150,8 +151,9 @@ class GerTester { template void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - _ViewTypeA& A, - const _HostViewTypeA& h_A, + view_stride_adapter<_ViewTypeA, false>& A, + /*_ViewTypeA& A,*/ + /*const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation); @@ -286,8 +288,8 @@ void GerTesterpopulateVariables(alpha, x.h_view, y.h_view, A.h_view, - h_expected.d_view, x.d_view, y.d_view, A.d_view, + this->populateVariables(alpha, x/*.h_view*/, y/*.h_view*/, A/*.h_view*/, + h_expected.d_view, /*x.d_view, y.d_view, A.d_view,*/ expectedResultIsKnown); // ******************************************************************** @@ -333,7 +335,7 @@ void GerTestercallKkGerAndCompareAgainstExpected( - alpha, x.d_view, y.d_view, A.d_view, A.h_view, h_expected.d_view, + alpha, x.d_view, y.d_view, /*A.d_view, A.h_view*/A, h_expected.d_view, "non const {x,y}"); } @@ -344,7 +346,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, - A.d_view, A.h_view, + /*A.d_view, A.h_view*/A, h_expected.d_view, "const x"); } @@ -355,7 +357,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, - A.d_view, A.h_view, + /*A.d_view, A.h_view*/A, h_expected.d_view, "const y"); } @@ -366,7 +368,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, - y.d_view_const, A.d_view, A.h_view, + y.d_view_const, /*A.d_view, A.h_view*/A, h_expected.d_view, "const {x,y}"); } @@ -389,51 +391,53 @@ void GerTester void GerTester::populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, + Device>::populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>/*_HostViewTypeX& h_x*/& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + /*_HostViewTypeY& h_y,*/ + /*_HostViewTypeA& h_A,*/ _ViewTypeExpected& h_expected, - _ViewTypeX& x, _ViewTypeY& y, - _ViewTypeA& A, + /*_ViewTypeX& x, _ViewTypeY& y, + _ViewTypeA& A,*/ bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, h_x, h_y, h_A, h_expected); - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + this->populateAnalyticalValues(alpha, /*h_x*/x.h_view, /*h_y*/y.h_view, /*h_A*/A.h_view, h_expected); + Kokkos::deep_copy(/*x, h_x*/x.d_view,x.h_view); + Kokkos::deep_copy(/*y, h_y*/y.d_view,y.h_view); + Kokkos::deep_copy(/*A, h_A*/A.d_view,A.h_view); expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 1)) { alpha = 3; - h_x[0] = 2; + /*h_x*/x.h_view[0] = 2; - h_y[0] = 3; + /*h_y*/y.h_view[0] = 3; - h_A(0, 0) = 7; + /*h_A*/A.h_view(0, 0) = 7; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(/*x, h_x*/x.d_view,x.h_view); + Kokkos::deep_copy(/*y, h_y*/y.d_view,y.h_view); + Kokkos::deep_copy(/*A, h_A*/A.d_view,A.h_view); h_expected(0, 0) = 25; expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 2)) { alpha = 3; - h_x[0] = 2; + /*h_x*/x.h_view[0] = 2; - h_y[0] = 3; - h_y[1] = 4; + /*h_y*/y.h_view[0] = 3; + /*h_y*/y.h_view[1] = 4; - h_A(0, 0) = 7; - h_A(0, 1) = -6; + /*h_A*/A.h_view(0, 0) = 7; + /*h_A*/A.h_view(0, 1) = -6; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(/*x, h_x*/x.d_view,x.h_view); + Kokkos::deep_copy(/*y, h_y*/y.d_view,y.h_view); + Kokkos::deep_copy(/*A, h_A*/A.d_view,A.h_view); h_expected(0, 0) = 25; h_expected(0, 1) = 18; @@ -441,20 +445,20 @@ void GerTester void GerTester:: callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - _ViewTypeA& A, const _HostViewTypeA& h_A, + view_stride_adapter<_ViewTypeA, false>& A, + /*_ViewTypeA& A, const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -1383,7 +1388,7 @@ void GerTestercompareKkGerAgainstExpected(alpha, h_A, h_expected); + this->compareKkGerAgainstExpected(alpha, /*h_A*/A.h_view, h_expected); } } From d61e64ec6536056a281a75ae3878ea56c7ff70cd Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Fri, 15 Dec 2023 22:42:06 -0700 Subject: [PATCH 127/326] Backup --- blas/unit_test/Test_Blas2_ger.hpp | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index deaffc2bbd..ee824698b5 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -404,9 +404,9 @@ void GerTesterpopulateAnalyticalValues(alpha, /*h_x*/x.h_view, /*h_y*/y.h_view, /*h_A*/A.h_view, h_expected); - Kokkos::deep_copy(/*x, h_x*/x.d_view,x.h_view); - Kokkos::deep_copy(/*y, h_y*/y.d_view,y.h_view); - Kokkos::deep_copy(/*A, h_A*/A.d_view,A.h_view); + Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); + Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 1)) { @@ -418,9 +418,9 @@ void GerTestercompareKkGerAgainstExpected(alpha, /*h_A*/A.h_view, h_expected); } From fab15a48569abc5bc4332cdd1f003e51b79c524a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 02:27:32 -0700 Subject: [PATCH 128/326] Backup --- .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 10 +-- blas/unit_test/Test_Blas2_ger.hpp | 66 +++++++++---------- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index d05b09784e..42cc7f9b25 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -196,8 +196,9 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: cublasZgerc() requires LayoutLeft views."); \ + /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger( \ + space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ @@ -266,8 +267,9 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: cublasCgerc() requires LayoutLeft views."); \ + /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger( \ + space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index ee824698b5..7cbf64ad21 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -224,7 +224,7 @@ void GerTester h_vanilla( "vanilla = A + alpha * x * y^{t,h}", _M, _N); -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", @@ -380,7 +380,7 @@ void GerTester:: } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() @@ -802,7 +802,7 @@ GerTester:: } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() @@ -838,7 +838,7 @@ GerTester:: int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -869,7 +869,7 @@ GerTester:: int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -884,7 +884,7 @@ GerTester:: for (int j(0); j < _N; ++j) { if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() @@ -897,7 +897,7 @@ GerTester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() @@ -977,7 +977,7 @@ GerTester:: } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) @@ -1011,7 +1011,7 @@ GerTester:: int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1024,7 +1024,7 @@ GerTester:: for (int j(0); j < _N; ++j) { if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; @@ -1096,7 +1096,7 @@ GerTester:: } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() @@ -1129,7 +1129,7 @@ GerTester:: } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() @@ -1140,7 +1140,7 @@ GerTester:: } } // for j } // for i -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll @@ -1218,7 +1218,7 @@ GerTester:: int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1248,7 +1248,7 @@ GerTester:: int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1302,7 +1302,7 @@ GerTester:: } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_A(i,j) = " << h_A(i, j) @@ -1312,7 +1312,7 @@ GerTester:: } } // for j } // for i -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() @@ -1353,7 +1353,7 @@ GerTester:: int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1371,7 +1371,7 @@ void GerTester -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG int test_ger(const std::string& caseName) { #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -1460,7 +1460,7 @@ int test_ger(const std::string& /*caseName*/) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1505,7 +1505,7 @@ int test_ger(const std::string& /*caseName*/) { } } -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", caseName.c_str()); @@ -1527,7 +1527,7 @@ int test_ger(const std::string& /*caseName*/) { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1572,7 +1572,7 @@ int test_ger(const std::string& /*caseName*/) { } } -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); @@ -1593,7 +1593,7 @@ int test_ger(const std::string& /*caseName*/) { #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1635,7 +1635,7 @@ int test_ger(const std::string& /*caseName*/) { } } -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); @@ -1656,7 +1656,7 @@ int test_ger(const std::string& /*caseName*/) { #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1693,7 +1693,7 @@ int test_ger(const std::string& /*caseName*/) { tester.test(1024, 1024, 0); } -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); @@ -1712,7 +1712,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #endif -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if 1 // def HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); #else From 9d45602214576eaa94b2324f37b713b43257d21a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 02:51:14 -0700 Subject: [PATCH 129/326] Backup --- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 10 +- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 10 +- blas/unit_test/Test_Blas2_ger.hpp | 163 +++++++++--------- 3 files changed, 89 insertions(+), 94 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index 3ba437a5a7..10275b007f 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -183,8 +183,9 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ } else { \ - throw std::runtime_error( \ - "Error: blasZgerc() requires LayoutLeft views."); \ + /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger( \ + space, trans, alpha, X, Y, A); \ } \ } \ Kokkos::Profiling::popRegion(); \ @@ -252,8 +253,9 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ } else { \ - throw std::runtime_error( \ - "Error: blasCgerc() requires LayoutLeft views."); \ + /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger( \ + space, trans, alpha, X, Y, A); \ } \ } \ Kokkos::Profiling::popRegion(); \ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index c55d091516..cd295cec8b 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -199,8 +199,9 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: rocblasZgerc() requires LayoutLeft views."); \ + /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ + GER::ger( \ + space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ @@ -273,8 +274,9 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: rocblasCgec() requires LayoutLeft views."); \ + /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ + GER::ger( \ + space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 7cbf64ad21..ca15ad750f 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -79,11 +79,9 @@ class GerTester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>/*_HostViewTypeX& h_x*/& x, + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, - /*_HostViewTypeY& h_y, _HostViewTypeA& h_A,*/ - _ViewTypeExpected& h_expected, /*_ViewTypeX& x,*/ - /*_ViewTypeY& y, _ViewTypeA& A,*/ + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template @@ -152,8 +150,6 @@ class GerTester { template void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, - /*_ViewTypeA& A,*/ - /*const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation); @@ -224,7 +220,7 @@ void GerTesterpopulateVariables(alpha, x/*.h_view*/, y/*.h_view*/, A/*.h_view*/, - h_expected.d_view, /*x.d_view, y.d_view, A.d_view,*/ + this->populateVariables(alpha, x, y, A, + h_expected.d_view, expectedResultIsKnown); // ******************************************************************** @@ -297,7 +293,7 @@ void GerTester h_vanilla( "vanilla = A + alpha * x * y^{t,h}", _M, _N); -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", @@ -335,7 +331,7 @@ void GerTestercallKkGerAndCompareAgainstExpected( - alpha, x.d_view, y.d_view, /*A.d_view, A.h_view*/A, h_expected.d_view, + alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const {x,y}"); } @@ -346,7 +342,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, - /*A.d_view, A.h_view*/A, + A, h_expected.d_view, "const x"); } @@ -357,7 +353,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, - /*A.d_view, A.h_view*/A, + A, h_expected.d_view, "const y"); } @@ -368,7 +364,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, - y.d_view_const, /*A.d_view, A.h_view*/A, + y.d_view_const, A, h_expected.d_view, "const {x,y}"); } @@ -380,7 +376,7 @@ void GerTester void GerTester::populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>/*_HostViewTypeX& h_x*/& x, + Device>::populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, - /*_HostViewTypeY& h_y,*/ - /*_HostViewTypeA& h_A,*/ _ViewTypeExpected& h_expected, - /*_ViewTypeX& x, _ViewTypeY& y, - _ViewTypeA& A,*/ bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, /*h_x*/x.h_view, /*h_y*/y.h_view, /*h_A*/A.h_view, h_expected); - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(y.d_base,y.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 1)) { alpha = 3; - /*h_x*/x.h_view[0] = 2; + x.h_view[0] = 2; - /*h_y*/y.h_view[0] = 3; + y.h_view[0] = 3; - /*h_A*/A.h_view(0, 0) = 7; + A.h_view(0, 0) = 7; - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(y.d_base,y.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); h_expected(0, 0) = 25; expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 2)) { alpha = 3; - /*h_x*/x.h_view[0] = 2; + x.h_view[0] = 2; - /*h_y*/y.h_view[0] = 3; - /*h_y*/y.h_view[1] = 4; + y.h_view[0] = 3; + y.h_view[1] = 4; - /*h_A*/A.h_view(0, 0) = 7; - /*h_A*/A.h_view(0, 1) = -6; + A.h_view(0, 0) = 7; + A.h_view(0, 1) = -6; - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(y.d_base,y.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); h_expected(0, 0) = 25; h_expected(0, 1) = 18; @@ -445,20 +437,20 @@ void GerTester:: } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() @@ -802,7 +794,7 @@ GerTester:: } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() @@ -838,7 +830,7 @@ GerTester:: int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -869,7 +861,7 @@ GerTester:: int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -884,7 +876,7 @@ GerTester:: for (int j(0); j < _N; ++j) { if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() @@ -897,7 +889,7 @@ GerTester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() @@ -977,7 +969,7 @@ GerTester:: } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) @@ -1011,7 +1003,7 @@ GerTester:: int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1024,7 +1016,7 @@ GerTester:: for (int j(0); j < _N; ++j) { if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; @@ -1096,7 +1088,7 @@ GerTester:: } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() @@ -1129,7 +1121,7 @@ GerTester:: } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() @@ -1140,7 +1132,7 @@ GerTester:: } } // for j } // for i -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll @@ -1218,7 +1210,7 @@ GerTester:: int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1248,7 +1240,7 @@ GerTester:: int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1302,7 +1294,7 @@ GerTester:: } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_A(i,j) = " << h_A(i, j) @@ -1312,7 +1304,7 @@ GerTester:: } } // for j } // for i -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() @@ -1353,7 +1345,7 @@ GerTester:: int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; #endif } @@ -1368,10 +1360,9 @@ void GerTester:: callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, - /*_ViewTypeA& A, const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation) { -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): " @@ -1388,15 +1379,15 @@ void GerTestercompareKkGerAgainstExpected(alpha, /*h_A*/A.h_view, h_expected); + this->compareKkGerAgainstExpected(alpha, A.h_view, h_expected); } } } // namespace Test template -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG int test_ger(const std::string& caseName) { #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -1460,7 +1451,7 @@ int test_ger(const std::string& /*caseName*/) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1505,7 +1496,7 @@ int test_ger(const std::string& /*caseName*/) { } } -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", caseName.c_str()); @@ -1527,7 +1518,7 @@ int test_ger(const std::string& /*caseName*/) { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1572,7 +1563,7 @@ int test_ger(const std::string& /*caseName*/) { } } -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); @@ -1593,7 +1584,7 @@ int test_ger(const std::string& /*caseName*/) { #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1635,7 +1626,7 @@ int test_ger(const std::string& /*caseName*/) { } } -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); @@ -1656,7 +1647,7 @@ int test_ger(const std::string& /*caseName*/) { #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1693,7 +1684,7 @@ int test_ger(const std::string& /*caseName*/) { tester.test(1024, 1024, 0); } -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); @@ -1712,7 +1703,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #endif -#if 1 // def HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); #else From 70b01bf7dfb1f7bf7330835d5e0abad0ca559eb8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 02:55:35 -0700 Subject: [PATCH 130/326] Formatting --- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 8 +- .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 8 +- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 8 +- blas/unit_test/Test_Blas2_ger.hpp | 81 ++++++++++--------- 4 files changed, 53 insertions(+), 52 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index 10275b007f..20e2810cf0 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -184,8 +184,8 @@ namespace Impl { reinterpret_cast*>(A.data()), LDA); \ } else { \ /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger( \ - space, trans, alpha, X, Y, A); \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ Kokkos::Profiling::popRegion(); \ @@ -254,8 +254,8 @@ namespace Impl { reinterpret_cast*>(A.data()), LDA); \ } else { \ /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger( \ - space, trans, alpha, X, Y, A); \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ Kokkos::Profiling::popRegion(); \ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index 42cc7f9b25..3f80144f62 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -197,8 +197,8 @@ namespace Impl { reinterpret_cast(A.data()), LDA)); \ } else { \ /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger( \ - space, trans, alpha, X, Y, A); \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ @@ -268,8 +268,8 @@ namespace Impl { reinterpret_cast(A.data()), LDA)); \ } else { \ /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger( \ - space, trans, alpha, X, Y, A); \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index cd295cec8b..c21b61befa 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -200,8 +200,8 @@ namespace Impl { reinterpret_cast(A.data()), LDA)); \ } else { \ /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ - GER::ger( \ - space, trans, alpha, X, Y, A); \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ @@ -275,8 +275,8 @@ namespace Impl { reinterpret_cast(A.data()), LDA)); \ } else { \ /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ - GER::ger( \ - space, trans, alpha, X, Y, A); \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index ca15ad750f..df3d2cb5d1 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -79,8 +79,10 @@ class GerTester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, + void populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); @@ -148,10 +150,10 @@ class GerTester { T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, - const std::string& situation); + void callKkGerAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -284,8 +286,7 @@ void GerTesterpopulateVariables(alpha, x, y, A, - h_expected.d_view, + this->populateVariables(alpha, x, y, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** @@ -331,8 +332,7 @@ void GerTestercallKkGerAndCompareAgainstExpected( - alpha, x.d_view, y.d_view, A, h_expected.d_view, - "non const {x,y}"); + alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const {x,y}"); } // ******************************************************************** @@ -341,8 +341,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, - A, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, A, h_expected.d_view, "const x"); } @@ -352,8 +351,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, - A, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, A, h_expected.d_view, "const y"); } @@ -386,19 +384,22 @@ void GerTester -void GerTester::populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { +void GerTester< + ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, + Device>::populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, + bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); - Kokkos::deep_copy(x.d_base,x.h_base); - Kokkos::deep_copy(y.d_base,y.h_base); - Kokkos::deep_copy(A.d_base,A.h_base); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, + h_expected); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 1)) { @@ -410,9 +411,9 @@ void GerTester void GerTester:: - callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, - const std::string& situation) { + callKkGerAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( From 04b0599c698430ce42aa7f03e6c5e6090794e793 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 13:43:02 -0700 Subject: [PATCH 131/326] Correcting compilation error --- blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index 20e2810cf0..84693eb4ac 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -149,8 +149,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ + static void ger(const EXEC_SPACE& space, \ const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ @@ -219,8 +218,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ + static void ger(const EXEC_SPACE&, \ const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ From ceb9a873401122b13f022dcb9e3e7e123390d1f3 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 13:46:57 -0700 Subject: [PATCH 132/326] Typo --- blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index 84693eb4ac..a84c1cd1aa 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -218,7 +218,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE&, \ + static void ger(const EXEC_SPACE& space, \ const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ From 2febc9942a33b543b457fcf248f694878d364b79 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 17:52:09 -0700 Subject: [PATCH 133/326] Changes for syr and syr2, to be tested at weaver --- .../Test_Blas1_axpby_unification.hpp | 130 ++++++++++++++++++ blas/unit_test/Test_Blas2_syr.hpp | 87 ++++++------ blas/unit_test/Test_Blas2_syr2.hpp | 110 ++++++++------- 3 files changed, 236 insertions(+), 91 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 9709d580b3..ae76a52094 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -98,18 +98,21 @@ void impl_test_axpby_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } + std::cout << "kdc a-001" << std::endl; Kokkos::deep_copy(x.h_view, x.d_view); { ScalarTypeY randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); if (testWithNanY) { + std::cout << "kdc a-002" << std::endl; Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); } else { Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } } tY org_y("Org_Y", N); + std::cout << "kdc a-003" << std::endl; Kokkos::deep_copy(org_y.h_view, y.d_view); tScalarA valueA(Kokkos::ArithTraits::zero()); @@ -126,11 +129,13 @@ void impl_test_axpby_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); + std::cout << "kdc a-004" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else { + std::cout << "kdc a-005" << std::endl; Kokkos::deep_copy(b.h_view, b.d_view); valueB = b.h_view(0); KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); @@ -141,6 +146,7 @@ void impl_test_axpby_unification_compare( valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); + std::cout << "kdc a-006" << std::endl; Kokkos::deep_copy(h_a, a); valueA = h_a(); } @@ -153,16 +159,19 @@ void impl_test_axpby_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); + std::cout << "kdc a-007" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else { + std::cout << "kdc a-008" << std::endl; Kokkos::deep_copy(b.h_view, b.d_view); valueB = b.h_view(0); KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } } else { + std::cout << "kdc a-008" << std::endl; Kokkos::deep_copy(a.h_view, a.d_view); valueA = a.h_view(0); if constexpr (std::is_same_v) { @@ -174,17 +183,20 @@ void impl_test_axpby_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); + std::cout << "kdc a-009" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); } else { + std::cout << "kdc a-010" << std::endl; Kokkos::deep_copy(b.h_view, b.d_view); valueB = b.h_view(0); KokkosBlas::axpby(a.d_view, x.d_view, b.d_view, y.d_view); } } + std::cout << "kdc a-011" << std::endl; Kokkos::deep_copy(y.h_view, y.d_view); if (testWithNanY == false) { @@ -248,24 +260,28 @@ void impl_test_axpby_mv_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } + std::cout << "kdc b-001" << std::endl; Kokkos::deep_copy(x.h_view, x.d_view); { ScalarTypeY randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); if (testWithNanY) { + std::cout << "kdc b-002" << std::endl; Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); } else { Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } } tY org_y("Org_Y", N, K); + std::cout << "kdc b-003" << std::endl; Kokkos::deep_copy(org_y.h_view, y.d_view); // Cannot use "if constexpr (isRank1()) {" because rank-1 variables // are passed to current routine with view_stride_adapter<...> bool constexpr aIsRank1 = !std::is_same_v && !isRank0(); if constexpr (aIsRank1) { + std::cout << "kdc b-004" << std::endl; Kokkos::deep_copy(a.h_view, a.d_view); } @@ -273,6 +289,7 @@ void impl_test_axpby_mv_unification_compare( // are passed to current routine with view_stride_adapter<...> bool constexpr bIsRank1 = !std::is_same_v && !isRank0(); if constexpr (bIsRank1) { + std::cout << "kdc b-005" << std::endl; Kokkos::deep_copy(b.h_view, b.d_view); } @@ -289,6 +306,7 @@ void impl_test_axpby_mv_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); + std::cout << "kdc b-006" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } @@ -303,6 +321,7 @@ void impl_test_axpby_mv_unification_compare( valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); + std::cout << "kdc b-007" << std::endl; Kokkos::deep_copy(h_a, a); valueA = h_a(); } @@ -315,6 +334,7 @@ void impl_test_axpby_mv_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); + std::cout << "kdc b-008" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } @@ -334,6 +354,7 @@ void impl_test_axpby_mv_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); + std::cout << "kdc b-009" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } @@ -344,6 +365,7 @@ void impl_test_axpby_mv_unification_compare( } } + std::cout << "kdc b-010" << std::endl; Kokkos::deep_copy(y.h_view, y.d_view); if (testWithNanY == false) { @@ -551,6 +573,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); a = valueA; + std::cout << "kdc u-001" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -582,6 +605,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); a = valueA; + std::cout << "kdc u-002" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -613,6 +637,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); a = valueA; + std::cout << "kdc u-003" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -645,6 +670,7 @@ void impl_test_axpby_unification(int const N) { tScalarB b; view_stride_adapter y("Y", N); + std::cout << "kdc u-004" << std::endl; Kokkos::deep_copy(a, valueA); b = valueB; impl_test_axpby_unification_compare< @@ -680,7 +706,9 @@ void impl_test_axpby_unification(int const N) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N); + std::cout << "kdc u-005" << std::endl; Kokkos::deep_copy(a, valueA); + std::cout << "kdc u-006" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -714,7 +742,9 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); + std::cout << "kdc u-007" << std::endl; Kokkos::deep_copy(a, valueA); + std::cout << "kdc u-008" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -750,7 +780,9 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); + std::cout << "kdc u-009" << std::endl; Kokkos::deep_copy(a, valueA); + std::cout << "kdc u-010" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -782,6 +814,7 @@ void impl_test_axpby_unification(int const N) { tScalarB b; view_stride_adapter y("Y", N); + std::cout << "kdc u-011" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_unification_compare< @@ -817,7 +850,9 @@ void impl_test_axpby_unification(int const N) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N); + std::cout << "kdc u-012" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc u-013" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -850,7 +885,9 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); + std::cout << "kdc u-014" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc u-015" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -883,7 +920,9 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); + std::cout << "kdc u-016" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc u-017" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -915,6 +954,7 @@ void impl_test_axpby_unification(int const N) { tScalarB b; view_stride_adapter y("Y", N); + std::cout << "kdc u-018" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_unification_compare< @@ -950,7 +990,9 @@ void impl_test_axpby_unification(int const N) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N); + std::cout << "kdc u-019" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc u-020" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -983,7 +1025,9 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); + std::cout << "kdc u-021" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc u-022" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -1016,7 +1060,9 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); + std::cout << "kdc u-023" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc u-024" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -1126,6 +1172,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter y("Y", N, K); a = valueA; + std::cout << "kdc mvu-001" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -1157,6 +1204,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter y("Y", N, K); a = valueA; + std::cout << "kdc mvu-002" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -1192,11 +1240,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-003" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-004" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1222,6 +1272,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter y("Y", N, K); a = valueA; + std::cout << "kdc mvu-005" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -1256,11 +1307,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-006" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-007" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1288,6 +1341,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarB b; view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-008" << std::endl; Kokkos::deep_copy(a, valueA); b = valueB; impl_test_axpby_mv_unification_compare< @@ -1323,7 +1377,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-009" << std::endl; Kokkos::deep_copy(a, valueA); + std::cout << "kdc mvu-010" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -1357,7 +1413,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-011" << std::endl; Kokkos::deep_copy(a, valueA); + std::cout << "kdc mvu-012" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -1393,16 +1451,19 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-013" << std::endl; Kokkos::deep_copy(a, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-014" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-015" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1432,7 +1493,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-016" << std::endl; Kokkos::deep_copy(a, valueA); + std::cout << "kdc mvu-017" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -1467,16 +1530,19 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-018" << std::endl; Kokkos::deep_copy(a, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-019" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-020" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1502,6 +1568,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarB b; view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-021" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_mv_unification_compare< @@ -1537,7 +1604,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-022" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc mvu-023" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1570,7 +1639,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-024" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc mvu-025" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1603,16 +1674,19 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-026" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-027" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-028" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1638,7 +1712,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-029" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc mvu-030" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1670,16 +1746,19 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-031" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-032" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-033" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1709,11 +1788,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-034" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-035" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; @@ -1754,13 +1835,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-036" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-037" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } + std::cout << "kdc mvu-038" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1797,13 +1881,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-039" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-040" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } + std::cout << "kdc mvu-041" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1840,11 +1927,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-042" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-043" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } @@ -1852,11 +1941,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-044" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-045" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1886,13 +1977,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-046" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-047" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } + std::cout << "kdc mvu-048" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1928,11 +2022,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-049" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-050" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } @@ -1940,11 +2036,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-051" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-052" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } @@ -1971,6 +2069,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarB b; view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-053" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_mv_unification_compare< @@ -2006,7 +2105,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-054" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc mvu-055" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2039,7 +2140,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-056" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc mvu-057" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2072,16 +2175,19 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-058" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-059" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-060" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -2107,7 +2213,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-061" << std::endl; Kokkos::deep_copy(a.d_base, valueA); + std::cout << "kdc mvu-062" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2139,16 +2247,19 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); + std::cout << "kdc mvu-063" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-064" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-065" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -2178,11 +2289,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-066" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-067" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; @@ -2223,13 +2336,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-068" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-069" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } + std::cout << "kdc mvu-070" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2266,13 +2382,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-071" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-072" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } + std::cout << "kdc mvu-073" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2309,11 +2428,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-074" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-075" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } @@ -2321,11 +2442,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-076" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-077" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } @@ -2356,13 +2479,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-078" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-079" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } + std::cout << "kdc mvu-080" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2398,11 +2524,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { a.h_view[k] = valueA + k; } + std::cout << "kdc mvu-081" << std::endl; Kokkos::deep_copy(a.d_view, a.h_view); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } + std::cout << "kdc mvu-082" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } @@ -2410,11 +2538,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { b.h_view[k] = valueB + k; } + std::cout << "kdc mvu-083" << std::endl; Kokkos::deep_copy(b.d_view, b.h_view); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } + std::cout << "kdc mvu-084" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 6c2651c47e..af1ae871d9 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -76,9 +76,12 @@ class SyrTester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected, - _ViewTypeX& x, _ViewTypeA& A, + void populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeA, false>& A, + /*_HostViewTypeX& h_x, + _HostViewTypeA& h_A,*/ _ViewTypeExpected& h_expected, + /*_ViewTypeX& x, _ViewTypeA& A,*/ bool& expectedResultIsKnown); template @@ -146,8 +149,9 @@ class SyrTester { template void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, - _ViewTypeA& A, - const _HostViewTypeA& h_A, + view_stride_adapter<_ViewTypeA, false>& A, + /*_ViewTypeA& A, + const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation); @@ -283,8 +287,8 @@ void SyrTester::test( // ******************************************************************** // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A // ******************************************************************** - this->populateVariables(alpha, x.h_view, A.h_view, h_expected.d_view, - x.d_view, A.d_view, expectedResultIsKnown); + this->populateVariables(alpha, x/*.h_view*/, A/*.h_view*/, h_expected.d_view, + /*x.d_view, A.d_view,*/ expectedResultIsKnown); // ******************************************************************** // Step 3 of 7: populate h_vanilla @@ -329,7 +333,7 @@ void SyrTester::test( if (test_x) { this->callKkSyrAndCompareAgainstExpected( - alpha, x.d_view, A.d_view, A.h_view, h_expected.d_view, "non const x"); + alpha, x.d_view, A/*A.d_view, A.h_view*/, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { @@ -344,8 +348,8 @@ void SyrTester::test( if (test_cx) { Kokkos::deep_copy(A.d_base, org_A.d_base); - this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A.d_view, - A.h_view, h_expected.d_view, + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A/*A.d_view, + A.h_view*/, h_expected.d_view, "const x"); } @@ -372,42 +376,44 @@ void SyrTester::test( template void SyrTester::populateVariables( - ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected, _ViewTypeX& x, _ViewTypeA& A, + ScalarA& alpha,/*_HostViewTypeX& h_x, _HostViewTypeA& h_A,*/ + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected,/*_ViewTypeX& x, _ViewTypeA& A,*/ bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, h_x, h_A, h_expected); - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(A, h_A); + this->populateAnalyticalValues(alpha, x.h_view, A.h_view, /*h_x, h_A,*/ h_expected); + Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); expectedResultIsKnown = true; } else if (_N == 1) { alpha = 3; - h_x[0] = 2; + x.h_view/*h_x*/[0] = 2; - h_A(0, 0) = 7; + A.h_view/*h_x*/(0, 0) = 7; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); h_expected(0, 0) = 19; expectedResultIsKnown = true; } else if (_N == 2) { alpha = 3; - h_x[0] = -2; - h_x[1] = 9; + x.h_view/*h_x*/[0] = -2; + x.h_view/*h_x*/[1] = 9; - h_A(0, 0) = 17; - h_A(0, 1) = -43; - h_A(1, 0) = -43; - h_A(1, 1) = 101; + A.h_view/*h_x*/(0, 0) = 17; + A.h_view/*h_x*/(0, 1) = -43; + A.h_view/*h_x*/(1, 0) = -43; + A.h_view/*h_x*/(1, 1) = 101; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); if (_useUpOption) { h_expected(0, 0) = 29; @@ -430,17 +436,17 @@ void SyrTester::populateVariables( { ScalarX randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(A, rand_pool, randStart, randEnd); + Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(/*h_x, x*/x.h_base,x.d_base); + Kokkos::deep_copy(/*h_A, A*/A.h_base,A.d_base); if (_useHermitianOption && _A_is_complex) { // **************************************************************** @@ -448,12 +454,12 @@ void SyrTester::populateVariables( // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i + 1); j < _N; ++j) { - h_A(i, j) = _KAT_A::conj(h_A(j, i)); + A.h_view/*h_x*/(i, j) = _KAT_A::conj(A.h_view/*h_x*/(j, i)); } } for (int i(0); i < _N; ++i) { - h_A(i, i) = 0.5 * (h_A(i, i) + _KAT_A::conj(h_A(i, i))); + A.h_view/*h_x*/(i, i) = 0.5 * (A.h_view/*h_x*/(i, i) + _KAT_A::conj(A.h_view/*h_x*/(i, i))); } } else { // **************************************************************** @@ -461,18 +467,18 @@ void SyrTester::populateVariables( // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i + 1); j < _N; ++j) { - h_A(i, j) = h_A(j, i); + A.h_view/*h_x*/(i, j) = A.h_view/*h_x*/(j, i); } } } - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); } #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i, j) + std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view/*h_x*/(i, j) << std::endl; } } @@ -1438,7 +1444,8 @@ template void SyrTester:: callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, - _ViewTypeA& A, const _HostViewTypeA& h_A, + view_stride_adapter<_ViewTypeA, false>& A, + /*_ViewTypeA& A, const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -1461,7 +1468,7 @@ void SyrTester:: bool gotStdException(false); bool gotUnknownException(false); try { - KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A); + KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A.d_view); } catch (const std::exception& e) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation @@ -1486,8 +1493,8 @@ void SyrTester:: << "have thrown a std::exception"; if ((gotStdException == false) && (gotUnknownException == false)) { - Kokkos::deep_copy(h_A, A); - this->compareKkSyrAgainstReference(alpha, h_A, h_expected); + Kokkos::deep_copy(/*h_A, A*/A.h_base,A.d_base); + this->compareKkSyrAgainstReference(alpha, A.h_view/*h_A*/, h_expected); } } diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index a3b53129fe..0c8fa41765 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -83,10 +83,13 @@ class Syr2Tester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected, _ViewTypeX& x, - _ViewTypeY& y, _ViewTypeA& A, + void populateVariables(ScalarA& alpha, /*_HostViewTypeX& h_x, + _HostViewTypeY& h_y, _HostViewTypeA& h_A,*/ + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, /*_ViewTypeX& x, + _ViewTypeY& y, _ViewTypeA& A,*/ bool& expectedResultIsKnown); template @@ -154,8 +157,9 @@ class Syr2Tester { template void callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - _ViewTypeA& A, - const _HostViewTypeA& h_A, + view_stride_adapter<_ViewTypeA, false>& A, + /*_ViewTypeA& A, + const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation); @@ -296,8 +300,8 @@ void Syr2TesterpopulateVariables(alpha, x.h_view, y.h_view, A.h_view, - h_expected.d_view, x.d_view, y.d_view, A.d_view, + this->populateVariables(alpha, x, y, A, /*x.h_view, y.h_view, A.h_view, */ + h_expected.d_view, /*x.d_view, y.d_view, A.d_view, */ expectedResultIsKnown); // ******************************************************************** @@ -337,7 +341,7 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, - A.d_view, A.h_view, + A,/*.d_view, A.h_view,*/ h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time @@ -354,7 +358,7 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected( - alpha, x.d_view_const, y.d_view_const, A.d_view, A.h_view, + alpha, x.d_view_const, y.d_view_const, A,/*.d_view, A.h_view,*/ h_expected.d_view, "const x"); } @@ -385,54 +389,57 @@ void Syr2Tester void Syr2Tester::populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + Device>::populateVariables(ScalarA& alpha, /*_HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, + _HostViewTypeA& h_A,*/ + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, - _ViewTypeX& x, _ViewTypeY& y, - _ViewTypeA& A, + /*_ViewTypeX& x, _ViewTypeY& y, + _ViewTypeA& A,*/ bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, h_x, h_y, h_A, h_expected); - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, /*h_x, h_y, h_A,*/ h_expected); + Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); + Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); expectedResultIsKnown = true; } else if (_N == 1) { alpha = 3; - h_x[0] = 2; + /*h_x*/x.h_view[0] = 2; - h_y[0] = 4; + /*h_y*/y.h_view[0] = 4; - h_A(0, 0) = 7; + /*h_A*/A.h_view(0, 0) = 7; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); + Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); h_expected(0, 0) = 55; expectedResultIsKnown = true; } else if (_N == 2) { alpha = 3; - h_x[0] = -2; - h_x[1] = 9; + /*h_x*/x.h_view[0] = -2; + /*h_x*/x.h_view[1] = 9; - h_y[0] = 5; - h_y[1] = -4; + /*h_y*/y.h_view[0] = 5; + /*h_y*/y.h_view[1] = -4; - h_A(0, 0) = 17; - h_A(0, 1) = -43; - h_A(1, 0) = -43; - h_A(1, 1) = 101; + /*h_A*/A.h_view(0, 0) = 17; + /*h_A*/A.h_view(0, 1) = -43; + /*h_A*/A.h_view(1, 0) = -43; + /*h_A*/A.h_view(1, 1) = 101; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); + Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); + Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); if (_useUpOption) { h_expected(0, 0) = -43; @@ -455,24 +462,24 @@ void Syr2TestercompareKkSyr2AgainstReference(alpha, h_A, h_expected); + Kokkos::deep_copy(/*h_A, A*/A.h_base,A.d_base); + this->compareKkSyr2AgainstReference(alpha, /*h_A*/A.h_view, h_expected); } } From 7c51188badd529610e37be836c757385502b14f3 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 17:54:29 -0700 Subject: [PATCH 134/326] Formatting --- blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index a84c1cd1aa..bc1a10f61e 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -149,8 +149,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE& space, \ - const char trans[], \ + static void ger(const EXEC_SPACE& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -218,8 +217,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE& space, \ - const char trans[], \ + static void ger(const EXEC_SPACE& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ From 0c14496a3a30c07f0e14dda15d857dd5b5669c6b Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sat, 16 Dec 2023 23:06:23 -0700 Subject: [PATCH 135/326] Changes for axpby --- .../Test_Blas1_axpby_unification.hpp | 149 ++++++++++-------- 1 file changed, 87 insertions(+), 62 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index ae76a52094..152872bf83 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -27,8 +27,8 @@ // // Choices (01)-(03) are selected in the routines TEST_F() at the very // bottom of the file, when calling: -// - either test_axpby_unificationr<...>(), -// - or test_axpby_mv_unificationr<...>(). +// - either test_axpby_unification<...>(), +// - or test_axpby_mv_unification<...>(). // // Choices (04)-(05) are selected in routines: // - test_axpby_unification<...>(), when calling @@ -99,7 +99,7 @@ void impl_test_axpby_unification_compare( Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } std::cout << "kdc a-001" << std::endl; - Kokkos::deep_copy(x.h_view, x.d_view); + Kokkos::deep_copy(x.h_base, x.d_base); // Aqui { ScalarTypeY randStart, randEnd; @@ -113,7 +113,7 @@ void impl_test_axpby_unification_compare( } tY org_y("Org_Y", N); std::cout << "kdc a-003" << std::endl; - Kokkos::deep_copy(org_y.h_view, y.d_view); + Kokkos::deep_copy(org_y.h_base, y.d_base); // Aqui tScalarA valueA(Kokkos::ArithTraits::zero()); tScalarB valueB(Kokkos::ArithTraits::zero()); @@ -136,7 +136,7 @@ void impl_test_axpby_unification_compare( KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else { std::cout << "kdc a-005" << std::endl; - Kokkos::deep_copy(b.h_view, b.d_view); + Kokkos::deep_copy(b.h_base, b.d_base); // Aqui valueB = b.h_view(0); KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } @@ -166,13 +166,13 @@ void impl_test_axpby_unification_compare( KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else { std::cout << "kdc a-008" << std::endl; - Kokkos::deep_copy(b.h_view, b.d_view); + Kokkos::deep_copy(b.h_base, b.d_base); // Aqui valueB = b.h_view(0); KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } } else { std::cout << "kdc a-008" << std::endl; - Kokkos::deep_copy(a.h_view, a.d_view); + Kokkos::deep_copy(a.h_base, a.d_base); // Aqui valueA = a.h_view(0); if constexpr (std::is_same_v) { valueB = b; @@ -190,14 +190,14 @@ void impl_test_axpby_unification_compare( KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); } else { std::cout << "kdc a-010" << std::endl; - Kokkos::deep_copy(b.h_view, b.d_view); + Kokkos::deep_copy(b.h_base, b.d_base); // Aqui valueB = b.h_view(0); KokkosBlas::axpby(a.d_view, x.d_view, b.d_view, y.d_view); } } std::cout << "kdc a-011" << std::endl; - Kokkos::deep_copy(y.h_view, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); // Aqui if (testWithNanY == false) { for (int i(0); i < N; ++i) { @@ -238,6 +238,7 @@ void impl_test_axpby_unification_compare( } } +#if 1 // Aqui template void impl_test_axpby_mv_unification_compare( @@ -261,7 +262,7 @@ void impl_test_axpby_mv_unification_compare( Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } std::cout << "kdc b-001" << std::endl; - Kokkos::deep_copy(x.h_view, x.d_view); + Kokkos::deep_copy(x.h_base, x.d_base); // Aqui { ScalarTypeY randStart, randEnd; @@ -275,14 +276,14 @@ void impl_test_axpby_mv_unification_compare( } tY org_y("Org_Y", N, K); std::cout << "kdc b-003" << std::endl; - Kokkos::deep_copy(org_y.h_view, y.d_view); + Kokkos::deep_copy(org_y.h_base, y.d_base); // Aqui // Cannot use "if constexpr (isRank1()) {" because rank-1 variables // are passed to current routine with view_stride_adapter<...> bool constexpr aIsRank1 = !std::is_same_v && !isRank0(); if constexpr (aIsRank1) { std::cout << "kdc b-004" << std::endl; - Kokkos::deep_copy(a.h_view, a.d_view); + Kokkos::deep_copy(a.h_base, a.d_base); // Aqui } // Cannot use "if constexpr (isRank1()) {" because rank-1 variables @@ -290,7 +291,7 @@ void impl_test_axpby_mv_unification_compare( bool constexpr bIsRank1 = !std::is_same_v && !isRank0(); if constexpr (bIsRank1) { std::cout << "kdc b-005" << std::endl; - Kokkos::deep_copy(b.h_view, b.d_view); + Kokkos::deep_copy(b.h_base, b.d_base); // Aqui } tScalarA valueA(Kokkos::ArithTraits::zero()); @@ -366,7 +367,7 @@ void impl_test_axpby_mv_unification_compare( } std::cout << "kdc b-010" << std::endl; - Kokkos::deep_copy(y.h_view, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); // Aqui if (testWithNanY == false) { for (int i(0); i < N; ++i) { @@ -489,6 +490,7 @@ void impl_test_axpby_mv_unification_compare( } } } +#endif // Aqui template @@ -1238,10 +1241,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-003" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -1305,10 +1308,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-006" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -1455,10 +1458,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-014" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -1534,10 +1537,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-019" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -1678,10 +1681,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-027" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -1750,10 +1753,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-032" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -1786,10 +1789,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-034" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -1833,10 +1836,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-036" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -1879,10 +1882,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-039" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -1925,10 +1928,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-042" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -1939,10 +1942,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-044" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -1975,10 +1978,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-046" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2020,10 +2023,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-049" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2034,10 +2037,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-051" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -2179,10 +2182,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-059" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -2251,10 +2254,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-064" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -2287,10 +2290,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-066" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2334,10 +2337,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-068" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2380,10 +2383,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-071" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2426,10 +2429,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-074" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2440,10 +2443,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-076" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -2477,10 +2480,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-078" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2522,10 +2525,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; + a.h_view[k] = valueA + k; // Aqui } std::cout << "kdc mvu-081" << std::endl; - Kokkos::deep_copy(a.d_view, a.h_view); + Kokkos::deep_copy(a.d_base, a.h_base); // Aqui } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; @@ -2536,10 +2539,10 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; + b.h_view[k] = valueB + k; // Aqui } std::cout << "kdc mvu-083" << std::endl; - Kokkos::deep_copy(b.d_view, b.h_view); + Kokkos::deep_copy(b.d_base, b.h_base); // Aqui } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; @@ -2560,15 +2563,18 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // std::cout << "Leaving impl_test_axpby_mv_unification()" << std::endl; // std::cout << "=========================================" << std::endl; } +#endif // Aqui } // namespace Test template int test_axpby_unification() { +#if 1 // Aqui #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + std::cout << "Calling impl_test_axpby_unif(), L-LLL" << std::endl; Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); @@ -2577,13 +2583,16 @@ int test_axpby_unification() { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + std::cout << "Calling impl_test_axpby_unif(), L-RRR" << std::endl; Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>(14); #endif +#endif // Aqui #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + std::cout << "Calling impl_test_axpby_unif(), L-SSS" << std::endl; Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); @@ -2591,18 +2600,22 @@ int test_axpby_unification() { #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + std::cout << "Calling impl_test_axpby_unif(), L-SLL" << std::endl; Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); + std::cout << "Calling impl_test_axpby_unif(), L-LSS" << std::endl; Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); + std::cout << "Calling impl_test_axpby_unif(), L-SRS" << std::endl; Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>(14); + std::cout << "Calling impl_test_axpby_unif(), L-LSR" << std::endl; Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>(14); @@ -2610,6 +2623,7 @@ int test_axpby_unification() { return 1; } +#if 1 // Aqui template int test_axpby_mv_unification() { @@ -2662,6 +2676,7 @@ int test_axpby_mv_unification() { #endif return 1; } +#endif // Aqui #if defined(KOKKOSKERNELS_INST_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2671,12 +2686,14 @@ TEST_F(TestCategory, axpby_unification_float) { test_axpby_unification(); Kokkos::Profiling::popRegion(); } +#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_float"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif +#endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2685,6 +2702,7 @@ TEST_F(TestCategory, axpby_unification_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double"); test_axpby_unification(); } +#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_double"); @@ -2692,6 +2710,7 @@ TEST_F(TestCategory, axpby_mv_unification_double) { Kokkos::Profiling::popRegion(); } #endif +#endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2704,6 +2723,7 @@ TEST_F(TestCategory, axpby_unification_complex_double) { TestDevice>(); Kokkos::Profiling::popRegion(); } +#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_complex_double"); @@ -2713,6 +2733,7 @@ TEST_F(TestCategory, axpby_mv_unification_complex_double) { Kokkos::Profiling::popRegion(); } #endif +#endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2722,12 +2743,14 @@ TEST_F(TestCategory, axpby_unification_int) { test_axpby_unification(); Kokkos::Profiling::popRegion(); } +#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_int"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif +#endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) @@ -2737,6 +2760,7 @@ TEST_F(TestCategory, axpby_unification_double_int) { test_axpby_unification(); Kokkos::Profiling::popRegion(); } +#if 1 // Aqui TEST_F(TestCategory, axpby_double_mv_unification_int) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_double_int"); @@ -2744,3 +2768,4 @@ TEST_F(TestCategory, axpby_double_mv_unification_int) { Kokkos::Profiling::popRegion(); } #endif +#endif From 852afe197c5f029d81d745dc48e25c84786a3ef0 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 17 Dec 2023 00:23:50 -0700 Subject: [PATCH 136/326] Backup --- .../Test_Blas1_axpby_unification.hpp | 494 +++++++++--------- blas/unit_test/Test_Blas2_syr.hpp | 69 ++- blas/unit_test/Test_Blas2_syr2.hpp | 86 ++- 3 files changed, 313 insertions(+), 336 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 152872bf83..3077313697 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -98,22 +98,22 @@ void impl_test_axpby_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc a-001" << std::endl; - Kokkos::deep_copy(x.h_base, x.d_base); // Aqui +#endif + Kokkos::deep_copy(x.h_base, x.d_base); { ScalarTypeY randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); if (testWithNanY) { - std::cout << "kdc a-002" << std::endl; Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); } else { Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } } tY org_y("Org_Y", N); - std::cout << "kdc a-003" << std::endl; - Kokkos::deep_copy(org_y.h_base, y.d_base); // Aqui + Kokkos::deep_copy(org_y.h_base, y.d_base); tScalarA valueA(Kokkos::ArithTraits::zero()); tScalarB valueB(Kokkos::ArithTraits::zero()); @@ -129,14 +129,12 @@ void impl_test_axpby_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); - std::cout << "kdc a-004" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else { - std::cout << "kdc a-005" << std::endl; - Kokkos::deep_copy(b.h_base, b.d_base); // Aqui + Kokkos::deep_copy(b.h_base, b.d_base); valueB = b.h_view(0); KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } @@ -146,7 +144,6 @@ void impl_test_axpby_unification_compare( valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); - std::cout << "kdc a-006" << std::endl; Kokkos::deep_copy(h_a, a); valueA = h_a(); } @@ -159,20 +156,17 @@ void impl_test_axpby_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); - std::cout << "kdc a-007" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else { - std::cout << "kdc a-008" << std::endl; - Kokkos::deep_copy(b.h_base, b.d_base); // Aqui + Kokkos::deep_copy(b.h_base, b.d_base); valueB = b.h_view(0); KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } } else { - std::cout << "kdc a-008" << std::endl; - Kokkos::deep_copy(a.h_base, a.d_base); // Aqui + Kokkos::deep_copy(a.h_base, a.d_base); valueA = a.h_view(0); if constexpr (std::is_same_v) { valueB = b; @@ -183,21 +177,21 @@ void impl_test_axpby_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); - std::cout << "kdc a-009" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); } else { - std::cout << "kdc a-010" << std::endl; - Kokkos::deep_copy(b.h_base, b.d_base); // Aqui + Kokkos::deep_copy(b.h_base, b.d_base); valueB = b.h_view(0); KokkosBlas::axpby(a.d_view, x.d_view, b.d_view, y.d_view); } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc a-011" << std::endl; - Kokkos::deep_copy(y.h_base, y.d_base); // Aqui +#endif + Kokkos::deep_copy(y.h_base, y.d_base); if (testWithNanY == false) { for (int i(0); i < N; ++i) { @@ -238,7 +232,6 @@ void impl_test_axpby_unification_compare( } } -#if 1 // Aqui template void impl_test_axpby_mv_unification_compare( @@ -261,37 +254,35 @@ void impl_test_axpby_mv_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc b-001" << std::endl; - Kokkos::deep_copy(x.h_base, x.d_base); // Aqui +#endif + Kokkos::deep_copy(x.h_base, x.d_base); { ScalarTypeY randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); if (testWithNanY) { - std::cout << "kdc b-002" << std::endl; Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); } else { Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } } tY org_y("Org_Y", N, K); - std::cout << "kdc b-003" << std::endl; - Kokkos::deep_copy(org_y.h_base, y.d_base); // Aqui + Kokkos::deep_copy(org_y.h_base, y.d_base); // Cannot use "if constexpr (isRank1()) {" because rank-1 variables // are passed to current routine with view_stride_adapter<...> bool constexpr aIsRank1 = !std::is_same_v && !isRank0(); if constexpr (aIsRank1) { - std::cout << "kdc b-004" << std::endl; - Kokkos::deep_copy(a.h_base, a.d_base); // Aqui + Kokkos::deep_copy(a.h_base, a.d_base); } // Cannot use "if constexpr (isRank1()) {" because rank-1 variables // are passed to current routine with view_stride_adapter<...> bool constexpr bIsRank1 = !std::is_same_v && !isRank0(); if constexpr (bIsRank1) { - std::cout << "kdc b-005" << std::endl; - Kokkos::deep_copy(b.h_base, b.d_base); // Aqui + Kokkos::deep_copy(b.h_base, b.d_base); } tScalarA valueA(Kokkos::ArithTraits::zero()); @@ -307,7 +298,6 @@ void impl_test_axpby_mv_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); - std::cout << "kdc b-006" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } @@ -322,7 +312,6 @@ void impl_test_axpby_mv_unification_compare( valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); - std::cout << "kdc b-007" << std::endl; Kokkos::deep_copy(h_a, a); valueA = h_a(); } @@ -335,7 +324,6 @@ void impl_test_axpby_mv_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); - std::cout << "kdc b-008" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } @@ -355,7 +343,6 @@ void impl_test_axpby_mv_unification_compare( valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); - std::cout << "kdc b-009" << std::endl; Kokkos::deep_copy(h_b, b); valueB = h_b(); } @@ -366,8 +353,10 @@ void impl_test_axpby_mv_unification_compare( } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc b-010" << std::endl; - Kokkos::deep_copy(y.h_base, y.d_base); // Aqui +#endif + Kokkos::deep_copy(y.h_base, y.d_base); if (testWithNanY == false) { for (int i(0); i < N; ++i) { @@ -490,7 +479,6 @@ void impl_test_axpby_mv_unification_compare( } } } -#endif // Aqui template ) { // Avoid the test, due to compilation errors // ViewTypeBr0 b; @@ -575,7 +565,9 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); a = valueA; +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc u-001" << std::endl; +#endif Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -595,7 +587,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 03/16: Ascalar + Br1s_1 // ************************************************************ - // std::cout << "Starting case 03/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 03/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -607,7 +601,6 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); a = valueA; - std::cout << "kdc u-002" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -627,7 +620,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 04/16: Ascalar + Br1d // ************************************************************ - // std::cout << "Starting case 04/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 04/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -639,7 +634,6 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); a = valueA; - std::cout << "kdc u-003" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -658,7 +652,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 05/16: Ar0 + Bscalar // ************************************************************ - // std::cout << "Starting case 05/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 05/16" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -672,7 +668,6 @@ void impl_test_axpby_unification(int const N) { tScalarB b; view_stride_adapter y("Y", N); - std::cout << "kdc u-004" << std::endl; Kokkos::deep_copy(a, valueA); b = valueB; impl_test_axpby_unification_compare< @@ -693,7 +688,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 06/16: Ar0 + Br0 // ************************************************************ - // std::cout << "Starting case 06/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 06/16" << std::endl; +#endif if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors @@ -708,9 +705,7 @@ void impl_test_axpby_unification(int const N) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N); - std::cout << "kdc u-005" << std::endl; Kokkos::deep_copy(a, valueA); - std::cout << "kdc u-006" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -730,7 +725,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 07/16: Ar0 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 07/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 07/16" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -744,9 +741,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); - std::cout << "kdc u-007" << std::endl; Kokkos::deep_copy(a, valueA); - std::cout << "kdc u-008" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -768,7 +763,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 08/16: Ar0 + Br1d // ************************************************************ - // std::cout << "Starting case 08/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 08/16" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -782,9 +779,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); - std::cout << "kdc u-009" << std::endl; Kokkos::deep_copy(a, valueA); - std::cout << "kdc u-010" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -805,7 +800,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 09/16: Ar1s_1 + Bscalar // ************************************************************ - // std::cout << "Starting case 09/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 09/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -816,7 +813,6 @@ void impl_test_axpby_unification(int const N) { tScalarB b; view_stride_adapter y("Y", N); - std::cout << "kdc u-011" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_unification_compare< @@ -838,7 +834,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 10/16: Ar1s_1 + Br0 // ************************************************************ - // std::cout << "Starting case 10/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 10/16" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -852,9 +850,7 @@ void impl_test_axpby_unification(int const N) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N); - std::cout << "kdc u-012" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc u-013" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -876,7 +872,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 11/16: Ar1s_1 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 11/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 11/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -887,9 +885,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); - std::cout << "kdc u-014" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc u-015" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -911,7 +907,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 12/16: Ar1s_1 + Br1d // ************************************************************ - // std::cout << "Starting case 12/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 12/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -922,9 +920,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); - std::cout << "kdc u-016" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc u-017" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -945,7 +941,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 13/16: Ar1d + Bscalar // ************************************************************ - // std::cout << "Starting case 13/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 13/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -956,7 +954,6 @@ void impl_test_axpby_unification(int const N) { tScalarB b; view_stride_adapter y("Y", N); - std::cout << "kdc u-018" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_unification_compare< @@ -978,7 +975,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 14/16: Ar1d + Br0 // ************************************************************ - // std::cout << "Starting case 14/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 14/16" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -992,9 +991,7 @@ void impl_test_axpby_unification(int const N) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N); - std::cout << "kdc u-019" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc u-020" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -1016,7 +1013,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 15/16: Ar1d + Br1s_1 // ************************************************************ - // std::cout << "Starting case 15/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 15/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1027,9 +1026,7 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); - std::cout << "kdc u-021" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc u-022" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -1051,7 +1048,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 16/16: Ar1d + Br1d // ************************************************************ - // std::cout << "Starting case 16/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 16/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1062,9 +1061,10 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N); - std::cout << "kdc u-023" << std::endl; Kokkos::deep_copy(a.d_base, valueA); +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc u-024" << std::endl; +#endif Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -1083,7 +1083,6 @@ void impl_test_axpby_unification(int const N) { } } -#if 1 // Aqui template @@ -1130,7 +1129,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 01/36: Ascalar + Bscalar // ************************************************************ - // std::cout << "Starting case 01/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 01/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1160,7 +1161,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 02/36: Ascalar + Br0 // ************************************************************ - // std::cout << "Starting case 02/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 02/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1175,7 +1178,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter y("Y", N, K); a = valueA; +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc mvu-001" << std::endl; +#endif Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -1195,7 +1200,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 03/36: Ascalar + Br1s_1 // ************************************************************ - // std::cout << "Starting case 03/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 03/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1207,7 +1214,6 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter y("Y", N, K); a = valueA; - std::cout << "kdc mvu-002" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -1227,7 +1233,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 04/36: Ascalar + Br1s_k // ************************************************************ - // std::cout << "Starting case 04/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 04/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1241,15 +1249,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-003" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-004" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1263,7 +1269,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 05/36: Ascalar + Br1d,1 // ************************************************************ - // std::cout << "Starting case 05/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 05/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1275,7 +1283,6 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter y("Y", N, K); a = valueA; - std::cout << "kdc mvu-005" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -1294,7 +1301,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 06/36: Ascalar + Br1d,k // ************************************************************ - // std::cout << "Starting case 06/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 06/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1308,15 +1317,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-006" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-007" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1330,7 +1337,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 07/36: Ar0 + Bscalar // ************************************************************w - // std::cout << "Starting case 07/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 07/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1344,7 +1353,6 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarB b; view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-008" << std::endl; Kokkos::deep_copy(a, valueA); b = valueB; impl_test_axpby_mv_unification_compare< @@ -1365,7 +1373,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 08/36: Ar0 + Br0 // ************************************************************ - // std::cout << "Starting case 08/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 08/36" << std::endl; +#endif if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors @@ -1380,9 +1390,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-009" << std::endl; Kokkos::deep_copy(a, valueA); - std::cout << "kdc mvu-010" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -1402,7 +1410,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 09/36: Ar0 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 09/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 09/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1416,9 +1426,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-011" << std::endl; Kokkos::deep_copy(a, valueA); - std::cout << "kdc mvu-012" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -1440,7 +1448,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 10/36: Ar0 + Br1s_k // ************************************************************ - // std::cout << "Starting case 10/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 10/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1454,19 +1464,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-013" << std::endl; Kokkos::deep_copy(a, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-014" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-015" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1482,7 +1489,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 11/36: Ar0 + Br1d,1 // ************************************************************ - // std::cout << "Starting case 11/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 11/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1496,9 +1505,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-016" << std::endl; Kokkos::deep_copy(a, valueA); - std::cout << "kdc mvu-017" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, @@ -1519,7 +1526,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 12/36: Ar0 + Br1d,k // ************************************************************ - // std::cout << "Starting case 12/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 12/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1533,19 +1542,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-018" << std::endl; Kokkos::deep_copy(a, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-019" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-020" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1560,7 +1566,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 13/36: Ar1s_1 + Bscalar // ************************************************************w - // std::cout << "Starting case 13/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 13/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1571,7 +1579,6 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarB b; view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-021" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_mv_unification_compare< @@ -1593,7 +1600,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 14/36: Ar1s_1 + Br0 // ************************************************************ - // std::cout << "Starting case 14/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 14/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1607,9 +1616,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-022" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc mvu-023" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1631,7 +1638,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 15/36: Ar1s_1 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 15/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 15/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1642,9 +1651,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-024" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc mvu-025" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1666,7 +1673,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 16/36: Ar1s_1 + Br1s_k // ************************************************************ - // std::cout << "Starting case 16/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 16/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1677,19 +1686,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-026" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-027" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-028" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1704,7 +1710,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 17/36: Ar1s_1 + Br1d,1 // ************************************************************ - // std::cout << "Starting case 17/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 17/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1715,9 +1723,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-029" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc mvu-030" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1738,7 +1744,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 18/36: Ar1s_1 + Br1d,k // ************************************************************ - // std::cout << "Starting case 18/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 18/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1749,19 +1757,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-031" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-032" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-033" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1776,7 +1781,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 19/36: Ar1s_k + Bscalar // ************************************************************ - // std::cout << "Starting case 19/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 19/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1789,15 +1796,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-034" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-035" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; @@ -1820,7 +1825,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 20/36: Ar1s_k + Br0 // ************************************************************ - // std::cout << "Starting case 20/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 20/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -1836,18 +1843,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-036" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-037" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } - std::cout << "kdc mvu-038" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1869,7 +1873,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 21/36: Ar1s_k + Br1s_1 // ************************************************************ - // std::cout << "Starting case 21/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 21/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1882,18 +1888,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-039" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-040" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } - std::cout << "kdc mvu-041" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -1915,7 +1918,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 22/36: Ar1s_k + Br1s_k // ************************************************************ - // std::cout << "Starting case 22/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 22/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1928,29 +1933,25 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-042" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-043" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-044" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-045" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -1965,7 +1966,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 23/36: Ar1s_k + Br1d,1 // ************************************************************ - // std::cout << "Starting case 23/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 23/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -1978,18 +1981,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-046" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-047" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } - std::cout << "kdc mvu-048" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2010,7 +2010,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 24/36: Ar1s_k + Br1d,k // ************************************************************ - // std::cout << "Starting case 24/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 24/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2023,29 +2025,25 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-049" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-050" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-051" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-052" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } @@ -2061,7 +2059,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 25/36: Ar1d,1 + Bscalar // ************************************************************w - // std::cout << "Starting case 25/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 25/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2072,7 +2072,6 @@ void impl_test_axpby_mv_unification(int const N, int const K) { tScalarB b; view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-053" << std::endl; Kokkos::deep_copy(a.d_base, valueA); b = valueB; impl_test_axpby_mv_unification_compare< @@ -2094,7 +2093,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 26/36: Ar1d,1 + Br0 // ************************************************************ - // std::cout << "Starting case 26/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 26/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -2108,9 +2109,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { ViewTypeBr0 b("B"); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-054" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc mvu-055" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2132,7 +2131,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 27/36: Ar1d,1 + Br1s_1 // ************************************************************ - // std::cout << "Starting case 27/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 27/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2143,9 +2144,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-056" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc mvu-057" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2167,7 +2166,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 28/36: Ar1d,1 + Br1s_k // ************************************************************ - // std::cout << "Starting case 28/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 28/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2178,19 +2179,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-058" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-059" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-060" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -2205,7 +2203,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 29/36: Ar1d,1 + Br1d,1 // ************************************************************ - // std::cout << "Starting case 29/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 29/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2216,9 +2216,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", 1); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-061" << std::endl; Kokkos::deep_copy(a.d_base, valueA); - std::cout << "kdc mvu-062" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2239,7 +2237,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 30/36: Ar1d,1 + Br1d,k // ************************************************************ - // std::cout << "Starting case 30/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 30/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2250,19 +2250,16 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter b("B", K); view_stride_adapter y("Y", N, K); - std::cout << "kdc mvu-063" << std::endl; Kokkos::deep_copy(a.d_base, valueA); if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-064" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-065" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< @@ -2277,7 +2274,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 31/36: Ar1d,k + Bscalar // ************************************************************w - // std::cout << "Starting case 31/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 31/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2290,15 +2289,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-066" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-067" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; @@ -2321,7 +2318,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 32/36: Ar1d,k + Br0 // ************************************************************ - // std::cout << "Starting case 32/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 32/36" << std::endl; +#endif if constexpr (std::is_same_v) { // Avoid the test, due to compilation errors } else { @@ -2337,18 +2336,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-068" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-069" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } - std::cout << "kdc mvu-070" << std::endl; Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2370,7 +2366,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 33/36: Ar1d,k + Br1s_1 // ************************************************************ - // std::cout << "Starting case 33/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 33/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2383,18 +2381,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-071" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-072" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } - std::cout << "kdc mvu-073" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2416,7 +2411,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 34/36: Ar1d,k + Br1s_k // ************************************************************ - // std::cout << "Starting case 34/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 34/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2429,29 +2426,25 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-074" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-075" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-076" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } - std::cout << "kdc mvu-077" << std::endl; Kokkos::deep_copy(b.d_base, b.h_base); } @@ -2467,7 +2460,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 35/36: Ar1d,k + Br1d,1 // ************************************************************ - // std::cout << "Starting case 35/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 35/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2480,18 +2475,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-078" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-079" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } - std::cout << "kdc mvu-080" << std::endl; Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< tScalarA, view_stride_adapter, @@ -2512,7 +2504,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // ************************************************************ // Case 36/36: Ar1d,k + Br1d,k // ************************************************************ - // std::cout << "Starting case 36/36" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 36/36" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { @@ -2525,29 +2519,28 @@ void impl_test_axpby_mv_unification(int const N, int const K) { if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - a.h_view[k] = valueA + k; // Aqui + a.h_view[k] = valueA + k; } - std::cout << "kdc mvu-081" << std::endl; - Kokkos::deep_copy(a.d_base, a.h_base); // Aqui + Kokkos::deep_copy(a.d_base, a.h_base); } else { for (int k(0); k < K; ++k) { a.h_base[k] = valueA + k; } - std::cout << "kdc mvu-082" << std::endl; Kokkos::deep_copy(a.d_base, a.h_base); } if constexpr (std::is_same_v) { for (int k(0); k < K; ++k) { - b.h_view[k] = valueB + k; // Aqui + b.h_view[k] = valueB + k; } - std::cout << "kdc mvu-083" << std::endl; - Kokkos::deep_copy(b.d_base, b.h_base); // Aqui + Kokkos::deep_copy(b.d_base, b.h_base); } else { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "kdc mvu-084" << std::endl; +#endif Kokkos::deep_copy(b.d_base, b.h_base); } @@ -2563,18 +2556,18 @@ void impl_test_axpby_mv_unification(int const N, int const K) { // std::cout << "Leaving impl_test_axpby_mv_unification()" << std::endl; // std::cout << "=========================================" << std::endl; } -#endif // Aqui } // namespace Test template int test_axpby_unification() { -#if 1 // Aqui #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LLL" << std::endl; +#endif Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); @@ -2583,16 +2576,19 @@ int test_axpby_unification() { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-RRR" << std::endl; +#endif Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>(14); #endif -#endif // Aqui #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SSS" << std::endl; +#endif Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); @@ -2600,22 +2596,30 @@ int test_axpby_unification() { #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SLL" << std::endl; +#endif Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LSS" << std::endl; +#endif Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SRS" << std::endl; +#endif Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>(14); +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LSR" << std::endl; +#endif Test::impl_test_axpby_unification< tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>(14); @@ -2623,7 +2627,6 @@ int test_axpby_unification() { return 1; } -#if 1 // Aqui template int test_axpby_mv_unification() { @@ -2676,7 +2679,6 @@ int test_axpby_mv_unification() { #endif return 1; } -#endif // Aqui #if defined(KOKKOSKERNELS_INST_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2686,14 +2688,12 @@ TEST_F(TestCategory, axpby_unification_float) { test_axpby_unification(); Kokkos::Profiling::popRegion(); } -#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_float"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif -#endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2702,7 +2702,6 @@ TEST_F(TestCategory, axpby_unification_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double"); test_axpby_unification(); } -#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_double"); @@ -2710,7 +2709,6 @@ TEST_F(TestCategory, axpby_mv_unification_double) { Kokkos::Profiling::popRegion(); } #endif -#endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2723,7 +2721,6 @@ TEST_F(TestCategory, axpby_unification_complex_double) { TestDevice>(); Kokkos::Profiling::popRegion(); } -#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_complex_double"); @@ -2733,7 +2730,6 @@ TEST_F(TestCategory, axpby_mv_unification_complex_double) { Kokkos::Profiling::popRegion(); } #endif -#endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -2743,14 +2739,12 @@ TEST_F(TestCategory, axpby_unification_int) { test_axpby_unification(); Kokkos::Profiling::popRegion(); } -#if 1 // Aqui TEST_F(TestCategory, axpby_mv_unification_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_int"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif -#endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) @@ -2760,7 +2754,6 @@ TEST_F(TestCategory, axpby_unification_double_int) { test_axpby_unification(); Kokkos::Profiling::popRegion(); } -#if 1 // Aqui TEST_F(TestCategory, axpby_double_mv_unification_int) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::axpby_mv_unification_double_int"); @@ -2768,4 +2761,3 @@ TEST_F(TestCategory, axpby_double_mv_unification_int) { Kokkos::Profiling::popRegion(); } #endif -#endif diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index af1ae871d9..28312eee41 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -79,9 +79,7 @@ class SyrTester { void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeA, false>& A, - /*_HostViewTypeX& h_x, - _HostViewTypeA& h_A,*/ _ViewTypeExpected& h_expected, - /*_ViewTypeX& x, _ViewTypeA& A,*/ + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template @@ -150,8 +148,6 @@ class SyrTester { template void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, - /*_ViewTypeA& A, - const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation); @@ -287,8 +283,8 @@ void SyrTester::test( // ******************************************************************** // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A // ******************************************************************** - this->populateVariables(alpha, x/*.h_view*/, A/*.h_view*/, h_expected.d_view, - /*x.d_view, A.d_view,*/ expectedResultIsKnown); + this->populateVariables(alpha, x, A, h_expected.d_view, + expectedResultIsKnown); // ******************************************************************** // Step 3 of 7: populate h_vanilla @@ -333,7 +329,7 @@ void SyrTester::test( if (test_x) { this->callKkSyrAndCompareAgainstExpected( - alpha, x.d_view, A/*A.d_view, A.h_view*/, h_expected.d_view, "non const x"); + alpha, x.d_view, A, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { @@ -348,8 +344,8 @@ void SyrTester::test( if (test_cx) { Kokkos::deep_copy(A.d_base, org_A.d_base); - this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A/*A.d_view, - A.h_view*/, h_expected.d_view, + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A, + h_expected.d_view, "const x"); } @@ -376,44 +372,44 @@ void SyrTester::test( template void SyrTester::populateVariables( - ScalarA& alpha,/*_HostViewTypeX& h_x, _HostViewTypeA& h_A,*/ + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected,/*_ViewTypeX& x, _ViewTypeA& A,*/ + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, A.h_view, /*h_x, h_A,*/ h_expected); - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + this->populateAnalyticalValues(alpha, x.h_view, A.h_view, h_expected); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); expectedResultIsKnown = true; } else if (_N == 1) { alpha = 3; - x.h_view/*h_x*/[0] = 2; + x.h_view[0] = 2; - A.h_view/*h_x*/(0, 0) = 7; + A.h_view(0, 0) = 7; - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); h_expected(0, 0) = 19; expectedResultIsKnown = true; } else if (_N == 2) { alpha = 3; - x.h_view/*h_x*/[0] = -2; - x.h_view/*h_x*/[1] = 9; + x.h_view[0] = -2; + x.h_view[1] = 9; - A.h_view/*h_x*/(0, 0) = 17; - A.h_view/*h_x*/(0, 1) = -43; - A.h_view/*h_x*/(1, 0) = -43; - A.h_view/*h_x*/(1, 1) = 101; + A.h_view(0, 0) = 17; + A.h_view(0, 1) = -43; + A.h_view(1, 0) = -43; + A.h_view(1, 1) = 101; - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); if (_useUpOption) { h_expected(0, 0) = 29; @@ -445,8 +441,8 @@ void SyrTester::populateVariables( Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(/*h_x, x*/x.h_base,x.d_base); - Kokkos::deep_copy(/*h_A, A*/A.h_base,A.d_base); + Kokkos::deep_copy(x.h_base,x.d_base); + Kokkos::deep_copy(A.h_base,A.d_base); if (_useHermitianOption && _A_is_complex) { // **************************************************************** @@ -454,12 +450,12 @@ void SyrTester::populateVariables( // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i + 1); j < _N; ++j) { - A.h_view/*h_x*/(i, j) = _KAT_A::conj(A.h_view/*h_x*/(j, i)); + A.h_view(i, j) = _KAT_A::conj(A.h_view(j, i)); } } for (int i(0); i < _N; ++i) { - A.h_view/*h_x*/(i, i) = 0.5 * (A.h_view/*h_x*/(i, i) + _KAT_A::conj(A.h_view/*h_x*/(i, i))); + A.h_view(i, i) = 0.5 * (A.h_view(i, i) + _KAT_A::conj(A.h_view(i, i))); } } else { // **************************************************************** @@ -467,18 +463,18 @@ void SyrTester::populateVariables( // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i + 1); j < _N; ++j) { - A.h_view/*h_x*/(i, j) = A.h_view/*h_x*/(j, i); + A.h_view(i, j) = A.h_view(j, i); } } } - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); } #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view/*h_x*/(i, j) + std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view(i, j) << std::endl; } } @@ -1445,7 +1441,6 @@ template void SyrTester:: callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, - /*_ViewTypeA& A, const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -1493,8 +1488,8 @@ void SyrTester:: << "have thrown a std::exception"; if ((gotStdException == false) && (gotUnknownException == false)) { - Kokkos::deep_copy(/*h_A, A*/A.h_base,A.d_base); - this->compareKkSyrAgainstReference(alpha, A.h_view/*h_A*/, h_expected); + Kokkos::deep_copy(A.h_base,A.d_base); + this->compareKkSyrAgainstReference(alpha, A.h_view, h_expected); } } diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 0c8fa41765..e9c39e493b 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -83,13 +83,11 @@ class Syr2Tester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, /*_HostViewTypeX& h_x, - _HostViewTypeY& h_y, _HostViewTypeA& h_A,*/ + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, /*_ViewTypeX& x, - _ViewTypeY& y, _ViewTypeA& A,*/ + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template @@ -158,8 +156,6 @@ class Syr2Tester { template void callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, - /*_ViewTypeA& A, - const _HostViewTypeA& h_A,*/ const _ViewTypeExpected& h_expected, const std::string& situation); @@ -300,8 +296,8 @@ void Syr2TesterpopulateVariables(alpha, x, y, A, /*x.h_view, y.h_view, A.h_view, */ - h_expected.d_view, /*x.d_view, y.d_view, A.d_view, */ + this->populateVariables(alpha, x, y, A, + h_expected.d_view, expectedResultIsKnown); // ******************************************************************** @@ -341,7 +337,7 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, - A,/*.d_view, A.h_view,*/ + A, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time @@ -358,7 +354,7 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected( - alpha, x.d_view_const, y.d_view_const, A,/*.d_view, A.h_view,*/ + alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, "const x"); } @@ -389,57 +385,53 @@ void Syr2Tester void Syr2Tester::populateVariables(ScalarA& alpha, /*_HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A,*/ + Device>::populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, - /*_ViewTypeX& x, _ViewTypeY& y, - _ViewTypeA& A,*/ bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, /*h_x, h_y, h_A,*/ h_expected); - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(y.d_base,y.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); expectedResultIsKnown = true; } else if (_N == 1) { alpha = 3; - /*h_x*/x.h_view[0] = 2; + x.h_view[0] = 2; - /*h_y*/y.h_view[0] = 4; + y.h_view[0] = 4; - /*h_A*/A.h_view(0, 0) = 7; + A.h_view(0, 0) = 7; - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(y.d_base,y.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); h_expected(0, 0) = 55; expectedResultIsKnown = true; } else if (_N == 2) { alpha = 3; - /*h_x*/x.h_view[0] = -2; - /*h_x*/x.h_view[1] = 9; + x.h_view[0] = -2; + x.h_view[1] = 9; - /*h_y*/y.h_view[0] = 5; - /*h_y*/y.h_view[1] = -4; + y.h_view[0] = 5; + y.h_view[1] = -4; - /*h_A*/A.h_view(0, 0) = 17; - /*h_A*/A.h_view(0, 1) = -43; - /*h_A*/A.h_view(1, 0) = -43; - /*h_A*/A.h_view(1, 1) = 101; + A.h_view(0, 0) = 17; + A.h_view(0, 1) = -43; + A.h_view(1, 0) = -43; + A.h_view(1, 1) = 101; - Kokkos::deep_copy(/*x, h_x*/x.d_base,x.h_base); - Kokkos::deep_copy(/*y, h_y*/y.d_base,y.h_base); - Kokkos::deep_copy(/*A, h_A*/A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base,x.h_base); + Kokkos::deep_copy(y.d_base,y.h_base); + Kokkos::deep_copy(A.d_base,A.h_base); if (_useUpOption) { h_expected(0, 0) = -43; @@ -477,9 +469,9 @@ void Syr2TestercompareKkSyr2AgainstReference(alpha, /*h_A*/A.h_view, h_expected); + Kokkos::deep_copy(A.h_base,A.d_base); + this->compareKkSyr2AgainstReference(alpha, A.h_view, h_expected); } } From ebee1f1ef52b0da7b747a7d0d0d760eebbe12cd0 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 17 Dec 2023 00:34:21 -0700 Subject: [PATCH 137/326] Formatting --- .../Test_Blas1_axpby_unification.hpp | 4 +- blas/unit_test/Test_Blas2_syr.hpp | 47 ++++++------ blas/unit_test/Test_Blas2_syr2.hpp | 71 +++++++++---------- 3 files changed, 59 insertions(+), 63 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 3077313697..a5b79aa5ad 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -515,7 +515,9 @@ void impl_test_axpby_unification(int const N) { // ************************************************************ // Case 01/16: Ascalar + Bscalar // ************************************************************ - // std::cout << "Starting case 01/16" << std::endl; +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 01/16" << std::endl; +#endif for (size_t i(0); i < valuesA.size(); ++i) { tScalarA const valueA(valuesA[i]); for (size_t j(0); j < valuesB.size(); ++j) { diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 28312eee41..1253a8e329 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -146,10 +146,9 @@ class SyrTester { T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, - const std::string& situation); + void callKkSyrAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); template void callKkGerAndCompareKkSyrAgainstIt( @@ -328,8 +327,8 @@ void SyrTester::test( Kokkos::deep_copy(org_A.h_view, A.h_view); if (test_x) { - this->callKkSyrAndCompareAgainstExpected( - alpha, x.d_view, A, h_expected.d_view, "non const x"); + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view, A, + h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { @@ -345,8 +344,7 @@ void SyrTester::test( Kokkos::deep_copy(A.d_base, org_A.d_base); this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A, - h_expected.d_view, - "const x"); + h_expected.d_view, "const x"); } // ******************************************************************** @@ -372,17 +370,15 @@ void SyrTester::test( template void SyrTester::populateVariables( - ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { this->populateAnalyticalValues(alpha, x.h_view, A.h_view, h_expected); - Kokkos::deep_copy(x.d_base,x.h_base); - Kokkos::deep_copy(A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); expectedResultIsKnown = true; } else if (_N == 1) { @@ -392,8 +388,8 @@ void SyrTester::populateVariables( A.h_view(0, 0) = 7; - Kokkos::deep_copy(x.d_base,x.h_base); - Kokkos::deep_copy(A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); h_expected(0, 0) = 19; expectedResultIsKnown = true; @@ -408,8 +404,8 @@ void SyrTester::populateVariables( A.h_view(1, 0) = -43; A.h_view(1, 1) = 101; - Kokkos::deep_copy(x.d_base,x.h_base); - Kokkos::deep_copy(A.d_base,A.h_base); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); if (_useUpOption) { h_expected(0, 0) = 29; @@ -441,8 +437,8 @@ void SyrTester::populateVariables( Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(x.h_base,x.d_base); - Kokkos::deep_copy(A.h_base,A.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(A.h_base, A.d_base); if (_useHermitianOption && _A_is_complex) { // **************************************************************** @@ -467,7 +463,7 @@ void SyrTester::populateVariables( } } } - Kokkos::deep_copy(A.d_base,A.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); } #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -1439,10 +1435,9 @@ template template void SyrTester:: - callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, - const std::string& situation) { + callKkSyrAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; @@ -1488,7 +1483,7 @@ void SyrTester:: << "have thrown a std::exception"; if ((gotStdException == false) && (gotUnknownException == false)) { - Kokkos::deep_copy(A.h_base,A.d_base); + Kokkos::deep_copy(A.h_base, A.d_base); this->compareKkSyrAgainstReference(alpha, A.h_view, h_expected); } } diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index e9c39e493b..0396bd301f 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -154,10 +154,10 @@ class Syr2Tester { T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, - const std::string& situation); + void callKkSyr2AndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); template void callKkGerAndCompareKkSyr2AgainstIt( @@ -296,8 +296,7 @@ void Syr2TesterpopulateVariables(alpha, x, y, A, - h_expected.d_view, + this->populateVariables(alpha, x, y, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** @@ -336,8 +335,7 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, - A, + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time @@ -354,8 +352,7 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected( - alpha, x.d_view_const, y.d_view_const, A, - h_expected.d_view, "const x"); + alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, "const x"); } // ******************************************************************** @@ -384,20 +381,22 @@ void Syr2Tester -void Syr2Tester::populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { +void Syr2Tester< + ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, + Device>::populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, + bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); - Kokkos::deep_copy(x.d_base,x.h_base); - Kokkos::deep_copy(y.d_base,y.h_base); - Kokkos::deep_copy(A.d_base,A.h_base); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, + h_expected); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); expectedResultIsKnown = true; } else if (_N == 1) { @@ -409,9 +408,9 @@ void Syr2Tester void Syr2Tester:: - callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, - const std::string& situation) { + callKkSyr2AndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha << std::endl; @@ -1563,7 +1562,7 @@ void Syr2TestercompareKkSyr2AgainstReference(alpha, A.h_view, h_expected); } } From b500f96edd7fb43936e249a8bd0d0609ffed7102 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 17 Dec 2023 14:10:41 -0700 Subject: [PATCH 138/326] Just to force new checking tests in github --- blas/unit_test/Test_Blas2_syr2.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 0396bd301f..c49eba765b 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -502,7 +502,7 @@ void Syr2Tester< if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view(i, j) + std::cout << "h_origA(" << i << "," << j << ") = " << A.h_view(i, j) << std::endl; } } @@ -825,8 +825,8 @@ Syr2Tester:: if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) + << ", h_van(" << i << "," << j << ") = " << h_vanilla(i, j) << std::endl; } } @@ -1054,8 +1054,8 @@ Syr2Tester:: if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) + << ", h_van(" << i << "," << j << ") = " << h_vanilla(i, j) << std::endl; } } @@ -1184,8 +1184,8 @@ Syr2Tester:: if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) - << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) + << ", h_A(" << i << "," << j << ") = " << h_A(i, j) << std::endl; } } @@ -1411,8 +1411,8 @@ Syr2Tester:: if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) - << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) + << ", h_A(" << i << "," << j << ") = " << h_A(i, j) << std::endl; } } From 4e00981cab888ae35fd489d849fae62c98864b85 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 18 Dec 2023 12:45:57 -0700 Subject: [PATCH 139/326] Addressing feedback from Luc. --- .../Test_Blas1_axpby_unification.hpp | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index a5b79aa5ad..6ce7bad0b1 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -98,9 +98,6 @@ void impl_test_axpby_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc a-001" << std::endl; -#endif Kokkos::deep_copy(x.h_base, x.d_base); { @@ -188,9 +185,6 @@ void impl_test_axpby_unification_compare( } } -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc a-011" << std::endl; -#endif Kokkos::deep_copy(y.h_base, y.d_base); if (testWithNanY == false) { @@ -254,9 +248,6 @@ void impl_test_axpby_mv_unification_compare( Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc b-001" << std::endl; -#endif Kokkos::deep_copy(x.h_base, x.d_base); { @@ -353,9 +344,6 @@ void impl_test_axpby_mv_unification_compare( } } -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc b-010" << std::endl; -#endif Kokkos::deep_copy(y.h_base, y.d_base); if (testWithNanY == false) { @@ -567,9 +555,6 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); a = valueA; -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc u-001" << std::endl; -#endif Kokkos::deep_copy(b, valueB); impl_test_axpby_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -1064,9 +1049,6 @@ void impl_test_axpby_unification(int const N) { view_stride_adapter y("Y", N); Kokkos::deep_copy(a.d_base, valueA); -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc u-024" << std::endl; -#endif Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< tScalarA, view_stride_adapter, @@ -1180,9 +1162,6 @@ void impl_test_axpby_mv_unification(int const N, int const K) { view_stride_adapter y("Y", N, K); a = valueA; -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc mvu-001" << std::endl; -#endif Kokkos::deep_copy(b, valueB); impl_test_axpby_mv_unification_compare< tScalarA, tScalarA, view_stride_adapter, tScalarB, @@ -2540,9 +2519,6 @@ void impl_test_axpby_mv_unification(int const N, int const K) { for (int k(0); k < K; ++k) { b.h_base[k] = valueB + k; } -#ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "kdc mvu-084" << std::endl; -#endif Kokkos::deep_copy(b.d_base, b.h_base); } From cb24a0d477a35e20eabc321985e14d720788c120 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 13 Dec 2023 16:01:53 -0700 Subject: [PATCH 140/326] Don't call optimize_gemv for one-shot spmv --- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 6e7aec8a91..10d9f8f2ee 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -736,11 +736,9 @@ struct spmv_onemkl_wrapper { const_cast(A.graph.row_map.data()), const_cast(A.graph.entries.data()), const_cast(A.values.data())); - auto ev_opt = oneapi::mkl::sparse::optimize_gemv( - exec.sycl_queue(), mkl_mode, handle, {ev_set}); auto ev_gemv = oneapi::mkl::sparse::gemv(exec.sycl_queue(), mkl_mode, alpha, handle, - x.data(), beta, y.data(), {ev_opt}); + x.data(), beta, y.data(), {ev_set}); // MKL 2023.2 and up make this release okay async even though it takes a // pointer to a stack variable #if INTEL_MKL_VERSION >= 20230200 @@ -776,13 +774,11 @@ struct spmv_onemkl_wrapper { const_cast(A.graph.entries.data()), reinterpret_cast*>( const_cast(A.values.data()))); - auto ev_opt = oneapi::mkl::sparse::optimize_gemv( - exec.sycl_queue(), mkl_mode, handle, {ev_set}); auto ev_gemv = oneapi::mkl::sparse::gemv( exec.sycl_queue(), mkl_mode, alpha, handle, reinterpret_cast*>( const_cast(x.data())), - beta, reinterpret_cast*>(y.data()), {ev_opt}); + beta, reinterpret_cast*>(y.data()), {ev_set}); // MKL 2023.2 and up make this release okay async even though it takes a // pointer to a stack variable #if INTEL_MKL_VERSION >= 20230200 From 5868e99d1e3342ddbd4fcf09b01eff63beb36de8 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 20 Dec 2023 18:19:57 -0700 Subject: [PATCH 141/326] Add HIPManagedSpace support - CMake option for ETI - Run unit tests with a Kokkos::Device, not just Kokkos::HIP - Like we do for Cuda - Still use HIPSpace unless Managed is the only enabled memspace - Couple of minor fixes - Allow querying free HIPManagedSpace memory for SpGEMM - Disable VBD coloring (not a huge deal, had to do same on CUDA) - Use correct memory space in SpTRSV solve --- cmake/KokkosKernels_config.h.in | 1 + cmake/kokkoskernels_eti_devices.cmake | 13 ++++++++++++- common/src/KokkosKernels_ExecSpaceUtils.hpp | 11 +++++++++++ graph/unit_test/Test_Graph_graph_color.hpp | 19 +++++++++++++++---- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 6 ++---- test_common/Test_HIP.hpp | 13 ++++++++++++- 6 files changed, 53 insertions(+), 10 deletions(-) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 6f5b07f287..ef8fea78b8 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -53,6 +53,7 @@ /* Whether to build kernels for execution space Kokkos::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE /* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index 8c6cb540ae..8bd131f2a4 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -23,6 +23,7 @@ SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE MEMSPACE_HIPSPACE + MEMSPACE_HIPMANAGEDSPACE MEMSPACE_SYCLSPACE MEMSPACE_SYCLSHAREDSPACE MEMSPACE_OPENMPTARGET @@ -32,6 +33,7 @@ SET(MEM_SPACES SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::HIPSpace) +SET(MEMSPACE_HIPMANAGEDSPACE_CPP_TYPE Kokkos::HIPManagedSpace) SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) SET(MEMSPACE_OPENMPTARGETSPACE_CPP_TYPE Kokkos::Experimental::OpenMPTargetSpace) @@ -85,10 +87,19 @@ IF(KOKKOS_ENABLE_HIP) BOOL "Whether to pre instantiate kernels for the memory space Kokkos::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_HIPMANAGEDSPACE + OFF + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::HIPManagedSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: OFF." + ) IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) LIST(APPEND DEVICE_LIST "") ENDIF() + IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with HIP and complex double enabled.") @@ -197,7 +208,7 @@ KOKKOSKERNELS_ADD_OPTION( ) SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) -SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) +SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE HIPMANAGEDSPACE) SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_OPENMPTARGET_VALID_MEM_SPACES OPENMPTARGETSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index 2ec09f4069..4d3a3002b4 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -215,10 +215,21 @@ inline void kk_get_free_total_memory(size_t& free_mem, total_mem /= n_streams; } template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); +} +template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); +} #endif // FIXME_SYCL Use compiler extension instead of low level interface when diff --git a/graph/unit_test/Test_Graph_graph_color.hpp b/graph/unit_test/Test_Graph_graph_color.hpp index 5d4eec03ca..101c489bc0 100644 --- a/graph/unit_test/Test_Graph_graph_color.hpp +++ b/graph/unit_test/Test_Graph_graph_color.hpp @@ -110,10 +110,15 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, COLORING_DEFAULT, COLORING_SERIAL, COLORING_VB, COLORING_VBBIT, COLORING_VBCS}; -#ifdef KOKKOS_ENABLE_CUDA + // FIXME: VBD sometimes fails on CUDA and HIP +#if defined(KOKKOS_ENABLE_CUDA) if (!std::is_same::value) { coloring_algorithms.push_back(COLORING_VBD); } +#elif defined(KOKKOS_ENABLE_HIP) + if (!std::is_same::value) { + coloring_algorithms.push_back(COLORING_VBD); + } #else coloring_algorithms.push_back(COLORING_VBD); #endif @@ -174,9 +179,15 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, } } } - EXPECT_TRUE((num_conflict == conf)); - - EXPECT_TRUE((num_conflict == 0)); + EXPECT_TRUE((num_conflict == conf)) + << "Coloring algo " << (int)coloring_algorithm + << ": kk_is_d1_coloring_valid returned incorrect number of conflicts (" + << num_conflict << ", should be " << conf << ")"; + + EXPECT_TRUE((num_conflict == 0)) + << "Coloring algo " << (int)coloring_algorithm + << ": D1 coloring produced invalid coloring (" << num_conflict + << " conflicts)"; } // device::execution_space::finalize(); } diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 6188d001b1..a64a4d23bc 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -664,8 +664,6 @@ struct LowerTriLvlSchedTP2SolverFunctor { // Helper functors for Lower-triangular solve with SpMV template struct SparseTriSupernodalSpMVFunctor { - // using execution_space = typename LHSType::execution_space; - // using memory_space = typename execution_space::memory_space; using execution_space = typename TriSolveHandle::HandleExecSpace; using memory_space = typename TriSolveHandle::HandleTempMemorySpace; @@ -2913,7 +2911,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename ExecutionSpace::memory_space; + using memory_space = typename TriSolveHandle::HandleTempMemorySpace; using device_t = Kokkos::Device; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; @@ -3311,7 +3309,7 @@ void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - using memory_space = typename ExecutionSpace::memory_space; + using memory_space = typename TriSolveHandle::HandleTempMemorySpace; using device_t = Kokkos::Device; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; diff --git a/test_common/Test_HIP.hpp b/test_common/Test_HIP.hpp index c9e02698c5..dfb8e1d687 100644 --- a/test_common/Test_HIP.hpp +++ b/test_common/Test_HIP.hpp @@ -31,7 +31,18 @@ class hip : public ::testing::Test { static void TearDownTestCase() {} }; +using HIPSpaceDevice = Kokkos::Device; +using HIPManagedSpaceDevice = + Kokkos::Device; + #define TestCategory hip -#define TestDevice Kokkos::HIP + +// Prefer for any testing where only one exec space is used +#if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) && \ + !defined(KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) +#define TestDevice HIPManagedSpaceDevice +#else +#define TestDevice HIPSpaceDevice +#endif #endif // TEST_HIP_HPP From 772183b7a96917409888fbfb70cb638620a10af4 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 24 Dec 2023 19:44:35 -0700 Subject: [PATCH 142/326] Backup --- ...Blas1_axpby_unification_attempt_traits.hpp | 34 +++++++++++++------ blas/src/KokkosBlas1_axpby.hpp | 30 +++++++++++----- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 49172a4c10..7e59e52166 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -104,12 +104,17 @@ struct AxpbyUnificationAttemptTraits { // - variable names begin with lower case letters // - type names begin with upper case letters // ******************************************************************** - private: + public: static constexpr bool onDevice = KokkosKernels::Impl::kk_is_gpu_exec_space(); + + private: static constexpr bool onHost = !onDevice; + public: static constexpr bool a_is_scalar = !Kokkos::is_view_v; + + private: static constexpr bool a_is_r0 = Tr0_val(); static constexpr bool a_is_r1s = Tr1s_val(); static constexpr bool a_is_r1d = Tr1d_val(); @@ -117,7 +122,10 @@ struct AxpbyUnificationAttemptTraits { static constexpr bool x_is_r1 = Kokkos::is_view_v && (XMV::rank == 1); static constexpr bool x_is_r2 = Kokkos::is_view_v && (XMV::rank == 2); + public: static constexpr bool b_is_scalar = !Kokkos::is_view_v; + + private: static constexpr bool b_is_r0 = Tr0_val(); static constexpr bool b_is_r1s = Tr1s_val(); static constexpr bool b_is_r1d = Tr1d_val(); @@ -220,10 +228,12 @@ struct AxpbyUnificationAttemptTraits { // 'AtInputScalarTypeA_nonConst' >; - using InternalTypeA_onDevice = + using InternalTypeA_onDevice = std::conditional_t< + a_is_scalar && b_is_scalar && onDevice, // Aqui + InternalScalarTypeA, Kokkos::View>; + Kokkos::MemoryTraits>>; using InternalTypeA_onHost = std::conditional_t< (a_is_r1d || a_is_r1s) && xyRank2Case && onHost, @@ -276,13 +286,15 @@ struct AxpbyUnificationAttemptTraits { // 'AtInputScalarTypeB_nonConst' >; - using InternalTypeB_onDevice = + using InternalTypeB_onDevice = std::conditional_t< + a_is_scalar && b_is_scalar && onDevice, // Aqui + InternalScalarTypeB, Kokkos::View>; + Kokkos::MemoryTraits>>; using InternalTypeB_onHost = std::conditional_t< - ((b_is_r1d || b_is_r1s) && xyRank2Case && onHost), + (b_is_r1d || b_is_r1s) && xyRank2Case && onHost, Kokkos::View>, @@ -614,7 +626,7 @@ struct AxpbyUnificationAttemptTraits { } } else { if constexpr (xyRank1Case) { - constexpr bool internalTypeA_isOk = internalTypeA_is_r1d; + constexpr bool internalTypeA_isOk = internalTypeA_is_r1d || (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); // Aqui static_assert( internalTypeA_isOk, "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" @@ -630,7 +642,7 @@ struct AxpbyUnificationAttemptTraits { "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ", onDevice, xyRank1Case: InternalTypeX is wrong"); - constexpr bool internalTypeB_isOk = internalTypeB_is_r1d; + constexpr bool internalTypeB_isOk = internalTypeB_is_r1d || (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); // Aqui static_assert( internalTypeB_isOk, "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" @@ -646,7 +658,7 @@ struct AxpbyUnificationAttemptTraits { "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ", onDevice, xyRank1Case: InternalTypeY is wrong"); } else { - constexpr bool internalTypeA_isOk = internalTypeA_is_r1d; + constexpr bool internalTypeA_isOk = internalTypeA_is_r1d || (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); // Aqui static_assert( internalTypeA_isOk, "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" @@ -662,7 +674,7 @@ struct AxpbyUnificationAttemptTraits { "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ", onDevice, xyRank2Case: InternalTypeX is wrong"); - constexpr bool internalTypeB_isOk = internalTypeB_is_r1d; + constexpr bool internalTypeB_isOk = internalTypeB_is_r1d || (a_is_scalar && b_is_scalar && internalTypeB_is_scalar); // Aqui static_assert( internalTypeB_isOk, "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" @@ -712,7 +724,7 @@ struct AxpbyUnificationAttemptTraits { // - [InternalTypeA / B] = [view, view] // **************************************************************** static_assert( - internalTypesAB_bothViews, + internalTypesAB_bothViews || (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), // Aqui "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ", onDevice, invalid combination of types"); } diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 0c94c4009c..ed57e220c9 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -17,9 +17,9 @@ #ifndef KOKKOSBLAS1_AXPBY_HPP_ #define KOKKOSBLAS1_AXPBY_HPP_ -#ifdef HAVE_KOKKOSKERNELS_DEBUG +//#ifdef HAVE_KOKKOSKERNELS_DEBUG #include -#endif +//#endif #include #include @@ -84,14 +84,26 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, InternalTypeY internal_Y = Y; if constexpr (AxpbyTraits::internalTypesAB_bothScalars) { - InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost< - AV, Impl::typeRank()>::getValue(a)); - InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost< - BV, Impl::typeRank()>::getValue(b)); + if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && AxpbyTraits::onDevice) { // Aqui + // Special case: 'a' and 'b' are kept as scalar, eventually changing precision to match the precisions of 'X' and 'Y' + std::cout << "Passing in axpby special case" << std::endl; + InternalTypeA internal_a(a); + InternalTypeA internal_b(b); - Impl::Axpby::axpby(exec_space, internal_a, internal_X, - internal_b, internal_Y); + Impl::Axpby::axpby(exec_space, internal_a, internal_X, + internal_b, internal_Y); + } + else { + InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost< + AV, Impl::typeRank()>::getValue(a)); + InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost< + BV, Impl::typeRank()>::getValue(b)); + + Impl::Axpby::axpby(exec_space, internal_a, internal_X, + internal_b, internal_Y); + } } else if constexpr (AxpbyTraits::internalTypesAB_bothViews) { constexpr bool internalLayoutA_isStride( std::is_same_v Date: Sun, 24 Dec 2023 22:44:12 -0700 Subject: [PATCH 143/326] Backup --- ...Blas1_axpby_unification_attempt_traits.hpp | 24 ++++++++++--------- blas/src/KokkosBlas1_axpby.hpp | 8 ++++--- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 7e59e52166..426091be0f 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -229,7 +229,7 @@ struct AxpbyUnificationAttemptTraits { >; using InternalTypeA_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Aqui + a_is_scalar && b_is_scalar && onDevice, // Keep 'a' as scalar InternalScalarTypeA, Kokkos::View; using InternalTypeB_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Aqui + a_is_scalar && b_is_scalar && onDevice, // Keep 'b' as scalar InternalScalarTypeB, Kokkos::View, view] + // 1) xyRank1Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] // // or // - // 2) xyRank2Case, with only one possible situation: - // - [InternalTypeA / B] = [view, view] + // 2) xyRank2Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] // **************************************************************** static_assert( - internalTypesAB_bothViews || (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), // Aqui + internalTypesAB_bothViews || (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ", onDevice, invalid combination of types"); } diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index ed57e220c9..cb9f5d8ae9 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -84,9 +84,11 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, InternalTypeY internal_Y = Y; if constexpr (AxpbyTraits::internalTypesAB_bothScalars) { - if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && AxpbyTraits::onDevice) { // Aqui - // Special case: 'a' and 'b' are kept as scalar, eventually changing precision to match the precisions of 'X' and 'Y' - std::cout << "Passing in axpby special case" << std::endl; + if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && AxpbyTraits::onDevice) { + // ****************************************************************** + // In this special case, 'a' and 'b' are kept as scalar, evantually + // changing precision to match the precisions of 'X' and 'Y' + // ****************************************************************** InternalTypeA internal_a(a); InternalTypeA internal_b(b); From 11d369b8137701b7c3218f2334262e032f9f436d Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 24 Dec 2023 23:25:37 -0700 Subject: [PATCH 144/326] Backup --- ...Blas1_axpby_unification_attempt_traits.hpp | 35 ++++++++++++------- blas/src/KokkosBlas1_axpby.hpp | 23 +++++++++--- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 426091be0f..1a66637b13 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -115,9 +115,9 @@ struct AxpbyUnificationAttemptTraits { static constexpr bool a_is_scalar = !Kokkos::is_view_v; private: - static constexpr bool a_is_r0 = Tr0_val(); - static constexpr bool a_is_r1s = Tr1s_val(); - static constexpr bool a_is_r1d = Tr1d_val(); + static constexpr bool a_is_r0 = Tr0_val(); + static constexpr bool a_is_r1s = Tr1s_val(); + static constexpr bool a_is_r1d = Tr1d_val(); static constexpr bool x_is_r1 = Kokkos::is_view_v && (XMV::rank == 1); static constexpr bool x_is_r2 = Kokkos::is_view_v && (XMV::rank == 2); @@ -126,9 +126,9 @@ struct AxpbyUnificationAttemptTraits { static constexpr bool b_is_scalar = !Kokkos::is_view_v; private: - static constexpr bool b_is_r0 = Tr0_val(); - static constexpr bool b_is_r1s = Tr1s_val(); - static constexpr bool b_is_r1d = Tr1d_val(); + static constexpr bool b_is_r0 = Tr0_val(); + static constexpr bool b_is_r1s = Tr1s_val(); + static constexpr bool b_is_r1d = Tr1d_val(); static constexpr bool y_is_r1 = Kokkos::is_view_v && (YMV::rank == 1); static constexpr bool y_is_r2 = Kokkos::is_view_v && (YMV::rank == 2); @@ -229,7 +229,7 @@ struct AxpbyUnificationAttemptTraits { >; using InternalTypeA_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Keep 'a' as scalar + a_is_scalar && b_is_scalar && onDevice, // Keep 'a' as scalar InternalScalarTypeA, Kokkos::View; using InternalTypeB_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Keep 'b' as scalar + a_is_scalar && b_is_scalar && onDevice, // Keep 'b' as scalar InternalScalarTypeB, Kokkos::View, view] // **************************************************************** static_assert( - internalTypesAB_bothViews || (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), + internalTypesAB_bothViews || + (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ", onDevice, invalid combination of types"); } diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index cb9f5d8ae9..b2942d69af 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -84,10 +84,21 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, InternalTypeY internal_Y = Y; if constexpr (AxpbyTraits::internalTypesAB_bothScalars) { - if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && AxpbyTraits::onDevice) { + // ******************************************************************** + // The unification logic applies the following general rules: + // 1) In a 'onHost' case, it makes the internal types for 'a' and 'b' + // to be both scalars (hence the name 'internalTypesAB_bothScalars') + // 2) In a 'onDevice' case, it makes the internal types for 'a' and 'b' + // to be Kokkos views. For performance reasons in Trilinos, the only + // exception for this rule is when the input types for both 'a' and + // 'b' are already scalars, in which case the internal types for 'a' + // and 'b' become scalars as well, eventually changing precision in + // order to match the precisions of 'X' and 'Y'. + // ******************************************************************** + if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && + AxpbyTraits::onDevice) { // ****************************************************************** - // In this special case, 'a' and 'b' are kept as scalar, evantually - // changing precision to match the precisions of 'X' and 'Y' + // We are in the exception situation for rule 2 // ****************************************************************** InternalTypeA internal_a(a); InternalTypeA internal_b(b); @@ -95,8 +106,10 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, Impl::Axpby::axpby(exec_space, internal_a, internal_X, internal_b, internal_Y); - } - else { + } else { + // ****************************************************************** + // We are in rule 1, that is, we are in a 'onHost' case now + // ****************************************************************** InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost< AV, Impl::typeRank()>::getValue(a)); InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost< From c573d6e3ff46f2d57f6064cb22c03265e409ad1b Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Sun, 24 Dec 2023 23:39:21 -0700 Subject: [PATCH 145/326] Minor typo --- blas/src/KokkosBlas1_axpby.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index b2942d69af..55da4f437a 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -17,9 +17,9 @@ #ifndef KOKKOSBLAS1_AXPBY_HPP_ #define KOKKOSBLAS1_AXPBY_HPP_ -//#ifdef HAVE_KOKKOSKERNELS_DEBUG +#ifdef HAVE_KOKKOSKERNELS_DEBUG #include -//#endif +#endif #include #include From 66f60e982e8e58ec89913b59304b226be177cfb9 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 11 Jan 2024 09:24:50 -0700 Subject: [PATCH 146/326] Add block support to all SPILUK algorithms (#2064) * Interface for block iluk * Progress. Test hooked up * Progress on test refactoring * More test reorg * Fix test * Refactor spiluk numeric a bit with a struct wrapper * Add good logging * progress * Fix block test * Progress but potential dead end * Giving up on this approach for now * progress * Make verbose * Progress * Progress * RP working? * Progress on TP alg * Bug fix * Progress on template stuff * Progress on block TP * Progress * Get rid of all the static_casts * More cleanup. Steams now support blocks * Tests not passing * Serail tests all working, both algs, blocked * Remove output coming from spiluk test * Final fixes for CPU * Cuda req full template specification for SerialGemm::invoke * Don't use scratch for now * Formatting * Fix warnings * Formatting * Add tolerance to view checks. Use macro and remove redundant test util * Fix for HIP * formatting * Another test reorg to fix weirdness on solo * formatting * Remove unused var * Github feedback * Remove test cout * formatting * Zero-size arrays can cause problems * Fix unused var warning --- .../impl/KokkosBatched_Trsm_Serial_Impl.hpp | 26 + .../impl/KokkosBatched_Trsm_Team_Impl.hpp | 30 + .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 1439 ++++++++++------- .../impl/KokkosSparse_spiluk_numeric_spec.hpp | 16 +- .../KokkosSparse_spiluk_symbolic_impl.hpp | 6 - sparse/src/KokkosKernels_Handle.hpp | 8 +- sparse/src/KokkosSparse_spiluk.hpp | 1 - sparse/src/KokkosSparse_spiluk_handle.hpp | 78 +- sparse/unit_test/Test_Sparse_par_ilut.hpp | 108 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 856 +++++----- sparse/unit_test/Test_vector_fixtures.hpp | 194 +++ 11 files changed, 1645 insertions(+), 1117 deletions(-) create mode 100644 sparse/unit_test/Test_vector_fixtures.hpp diff --git a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp index 268df195ce..4d094c24d2 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp @@ -176,6 +176,32 @@ struct SerialTrsm +struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke( + ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + } +}; + +template +struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke( + ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + } +}; + /// /// L/U/NT /// diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index 41fe47a35e..a7430775ea 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -99,6 +99,36 @@ struct TeamTrsm +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftLower::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + +template +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftLower::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + /// /// L/U/NT /// diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index c2863885b2..9484a02c11 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -21,8 +21,17 @@ /// \brief Implementation(s) of the numeric phase of sparse ILU(k). #include +#include #include #include +#include "KokkosBatched_SetIdentity_Decl.hpp" +#include "KokkosBatched_SetIdentity_Impl.hpp" +#include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Trsm_Serial_Impl.hpp" +#include "KokkosBatched_Axpy.hpp" +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +#include "KokkosBlas1_set.hpp" //#define NUMERIC_OUTPUT_INFO @@ -30,635 +39,891 @@ namespace KokkosSparse { namespace Impl { namespace Experimental { -// struct UnsortedTag {}; - -template -struct ILUKLvlSchedRPNumericFunctor { - using lno_t = typename AEntriesType::non_const_value_type; - using scalar_t = typename AValuesType::non_const_value_type; - ARowMapType A_row_map; - AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; - LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; - UEntriesType U_entries; - UValuesType U_values; - LevelViewType level_idx; - WorkViewType iw; - nnz_lno_t lev_start; - - ILUKLvlSchedRPNumericFunctor( - const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const nnz_lno_t &lev_start_) - : A_row_map(A_row_map_), - A_entries(A_entries_), - A_values(A_values_), - L_row_map(L_row_map_), - L_entries(L_entries_), - L_values(L_values_), - U_row_map(U_row_map_), - U_entries(U_entries_), - U_values(U_values_), - level_idx(level_idx_), - iw(iw_), - lev_start(lev_start_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = level_idx(i); - auto tid = i - lev_start; - auto k1 = L_row_map(rowid); - auto k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) { -#else - for (auto k = k1; k < k2; ++k) { -#endif - auto col = L_entries(k); - L_values(k) = 0.0; - iw(tid, col) = k; +template +struct IlukWrap { + // + // Useful types + // + using execution_space = typename IlukHandle::execution_space; + using memory_space = typename IlukHandle::memory_space; + using lno_t = typename IlukHandle::nnz_lno_t; + using size_type = typename IlukHandle::size_type; + using scalar_t = typename IlukHandle::nnz_scalar_t; + using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; + using HandleDeviceValueType = typename IlukHandle::nnz_value_view_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + using LevelViewType = typename IlukHandle::nnz_lno_view_t; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename IlukHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename IlukHandle::RangePolicy; + using sview_1d = typename Kokkos::View; + + static team_policy get_team_policy(const size_type nrows, + const int team_size) { + team_policy rv; + if (team_size == -1) { + rv = team_policy(nrows, Kokkos::AUTO); + } else { + rv = team_policy(nrows, team_size); } -#ifdef KEEP_DIAG - L_values(k2 - 1) = scalar_t(1.0); -#endif + return rv; + } - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - auto col = U_entries(k); - U_values(k) = 0.0; - iw(tid, col) = k; - } - - // Unpack the ith row of A - k1 = A_row_map(rowid); - k2 = A_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - auto col = A_entries(k); - auto ipos = iw(tid, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - } - - // Eliminate prev rows - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) { -#else - for (auto k = k1; k < k2; ++k) { -#endif - auto prev_row = L_entries(k); -#ifdef KEEP_DIAG - auto fact = L_values(k) / U_values(U_row_map(prev_row)); -#else - auto fact = L_values(k) * U_values(U_row_map(prev_row)); -#endif - L_values(k) = fact; - for (auto kk = U_row_map(prev_row) + 1; kk < U_row_map(prev_row + 1); - ++kk) { - auto col = U_entries(kk); - auto ipos = iw(tid, col); - if (ipos == -1) continue; - auto lxu = -U_values(kk) * fact; - if (col < rowid) - L_values(ipos) += lxu; - else - U_values(ipos) += lxu; - } // end for kk - } // end for k - -#ifdef KEEP_DIAG - if (U_values(iw(tid, rowid)) == 0.0) { - U_values(iw(tid, rowid)) = 1e6; - } -#else - if (U_values(iw(tid, rowid)) == 0.0) { - U_values(iw(tid, rowid)) = 1e6; + static team_policy get_team_policy(execution_space exe_space, + const size_type nrows, + const int team_size) { + team_policy rv; + if (team_size == -1) { + rv = team_policy(exe_space, nrows, Kokkos::AUTO); } else { - U_values(iw(tid, rowid)) = 1.0 / U_values(iw(tid, rowid)); + rv = team_policy(exe_space, nrows, team_size); } -#endif + return rv; + } - // Reset - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) -#else - for (auto k = k1; k < k2; ++k) -#endif - iw(tid, L_entries(k)) = -1; + static range_policy get_range_policy(const lno_t start, const lno_t end) { + range_policy rv(start, end); + return rv; + } - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) iw(tid, U_entries(k)) = -1; + static range_policy get_range_policy(execution_space exe_space, + const lno_t start, const lno_t end) { + range_policy rv(exe_space, start, end); + return rv; } -}; - -template -struct ILUKLvlSchedTP1NumericFunctor { - using execution_space = typename ARowMapType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - using size_type = typename ARowMapType::non_const_value_type; - using lno_t = typename AEntriesType::non_const_value_type; - using scalar_t = typename AValuesType::non_const_value_type; - - ARowMapType A_row_map; - AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; - LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; - UEntriesType U_entries; - UValuesType U_values; - LevelViewType level_idx; - WorkViewType iw; - nnz_lno_t lev_start; - - ILUKLvlSchedTP1NumericFunctor( - const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const nnz_lno_t &lev_start_) - : A_row_map(A_row_map_), - A_entries(A_entries_), - A_values(A_values_), - L_row_map(L_row_map_), - L_entries(L_entries_), - L_values(L_values_), - U_row_map(U_row_map_), - U_entries(U_entries_), - U_values(U_values_), - level_idx(level_idx_), - iw(iw_), - lev_start(lev_start_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - nnz_lno_t my_team = static_cast(team.league_rank()); - nnz_lno_t rowid = - static_cast(level_idx(my_team + lev_start)); // map to rowid - - size_type k1 = static_cast(L_row_map(rowid)); - size_type k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); -#else - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); -#endif -#ifdef KEEP_DIAG - // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); - Kokkos::single(Kokkos::PerTeam(team), - [&]() { L_values(k2 - 1) = scalar_t(1.0); }); -#endif + /** + * Common base class for SPILUK functors. Default version does not support + * blocks + */ + template + struct Common { + ARowMapType A_row_map; + AEntriesType A_entries; + AValuesType A_values; + LRowMapType L_row_map; + LEntriesType L_entries; + LValuesType L_values; + URowMapType U_row_map; + UEntriesType U_entries; + UValuesType U_values; + LevelViewType level_idx; + WorkViewType iw; + lno_t lev_start; + + // unblocked does not require any buffer + static constexpr size_type BUFF_SIZE = 1; + + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_) + : A_row_map(A_row_map_), + A_entries(A_entries_), + A_values(A_values_), + L_row_map(L_row_map_), + L_entries(L_entries_), + L_values(L_values_), + U_row_map(U_row_map_), + U_entries(U_entries_), + U_values(U_values_), + level_idx(level_idx_), + iw(iw_), + lev_start(lev_start_) { + KK_REQUIRE_MSG(block_size_ == 0, + "Tried to use blocks with the unblocked Common?"); + } - team.team_barrier(); - - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); - - team.team_barrier(); - - // Unpack the ith row of A - k1 = static_cast(A_row_map(rowid)); - k2 = static_cast(A_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t ipos = iw(my_team, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - }); - - team.team_barrier(); - - // Eliminate prev rows - k1 = static_cast(L_row_map(rowid)); - k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - for (size_type k = k1; k < k2 - 1; k++) -#else - for (size_type k = k1; k < k2; k++) -#endif - { - nnz_lno_t prev_row = L_entries(k); - - scalar_t fact = scalar_t(0.0); - Kokkos::single( - Kokkos::PerTeam(team), - [&](scalar_t &tmp_fact) { -#ifdef KEEP_DIAG - tmp_fact = L_values(k) / U_values(U_row_map(prev_row)); -#else - tmp_fact = L_values(k) * U_values(U_row_map(prev_row)); -#endif - L_values(k) = tmp_fact; - }, - fact); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, - U_row_map(prev_row + 1)), - [&](const size_type kk) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t ipos = iw(my_team, col); - auto lxu = -U_values(kk) * fact; - if (ipos != -1) { - if (col < rowid) - L_values(ipos) += lxu; - else - U_values(ipos) += lxu; - } - }); // end for kk + // lset + KOKKOS_INLINE_FUNCTION + void lset(const size_type nnz, const scalar_t &value) const { + L_values(nnz) = value; + } + // uset + KOKKOS_INLINE_FUNCTION + void uset(const size_type nnz, const scalar_t &value) const { + U_values(nnz) = value; + } + + // lset_id + KOKKOS_INLINE_FUNCTION + void lset_id(const size_type nnz) const { L_values(nnz) = scalar_t(1.0); } + + KOKKOS_INLINE_FUNCTION + void lset_id(const member_type &team, const size_type nnz) const { + // Not sure a Kokkos::single is really needed here since the + // race is harmless + Kokkos::single(Kokkos::PerTeam(team), + [&]() { L_values(nnz) = scalar_t(1.0); }); + } + + // divide. lhs /= rhs + KOKKOS_INLINE_FUNCTION + void divide(scalar_t &lhs, const scalar_t &rhs) const { lhs /= rhs; } + + KOKKOS_INLINE_FUNCTION + void divide(const member_type &team, scalar_t &lhs, + const scalar_t &rhs) const { + Kokkos::single(Kokkos::PerTeam(team), [&]() { lhs /= rhs; }); team.team_barrier(); - } // end for k - - // if (my_thread == 0) { - Kokkos::single(Kokkos::PerTeam(team), [&]() { - nnz_lno_t ipos = iw(my_team, rowid); -#ifdef KEEP_DIAG - if (U_values(ipos) == 0.0) { - U_values(ipos) = 1e6; + } + + // add. lhs += rhs + KOKKOS_INLINE_FUNCTION + void add(scalar_t &lhs, const scalar_t &rhs) const { lhs += rhs; } + + // multiply: return (alpha * lhs) * rhs + KOKKOS_INLINE_FUNCTION + scalar_t multiply(const scalar_t &alpha, const scalar_t &lhs, + const scalar_t &rhs, scalar_t *) const { + return alpha * lhs * rhs; + } + + // lget + KOKKOS_INLINE_FUNCTION + scalar_t &lget(const size_type nnz) const { return L_values(nnz); } + + // uget + KOKKOS_INLINE_FUNCTION + scalar_t &uget(const size_type nnz) const { return U_values(nnz); } + + // aget + KOKKOS_INLINE_FUNCTION + scalar_t aget(const size_type nnz) const { return A_values(nnz); } + + // uequal + KOKKOS_INLINE_FUNCTION + bool uequal(const size_type nnz, const scalar_t &value) const { + return U_values(nnz) == value; + } + }; + + // Partial specialization for block support + template + struct Common { + ARowMapType A_row_map; + AEntriesType A_entries; + AValuesType A_values; + LRowMapType L_row_map; + LEntriesType L_entries; + LValuesType L_values; + URowMapType U_row_map; + UEntriesType U_entries; + UValuesType U_values; + LevelViewType level_idx; + WorkViewType iw; + lno_t lev_start; + size_type block_size; + size_type block_items; + sview_1d ones; + + // blocked requires a buffer to store gemm output + static constexpr size_type BUFF_SIZE = 128; + + using LValuesUnmanaged2DBlockType = Kokkos::View< + typename LValuesType::value_type **, + typename KokkosKernels::Impl::GetUnifiedLayout< + LValuesType>::array_layout, + typename LValuesType::device_type, + Kokkos::MemoryTraits >; + + using UValuesUnmanaged2DBlockType = Kokkos::View< + typename UValuesType::value_type **, + typename KokkosKernels::Impl::GetUnifiedLayout< + UValuesType>::array_layout, + typename UValuesType::device_type, + Kokkos::MemoryTraits >; + + using AValuesUnmanaged2DBlockType = Kokkos::View< + typename AValuesType::value_type **, + typename KokkosKernels::Impl::GetUnifiedLayout< + AValuesType>::array_layout, + typename AValuesType::device_type, + Kokkos::MemoryTraits >; + + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_) + : A_row_map(A_row_map_), + A_entries(A_entries_), + A_values(A_values_), + L_row_map(L_row_map_), + L_entries(L_entries_), + L_values(L_values_), + U_row_map(U_row_map_), + U_entries(U_entries_), + U_values(U_values_), + level_idx(level_idx_), + iw(iw_), + lev_start(lev_start_), + block_size(block_size_), + block_items(block_size * block_size), + ones("ones", block_size) { + Kokkos::deep_copy(ones, 1.0); + KK_REQUIRE_MSG(block_size > 0, + "Tried to use block_size=0 with the blocked Common?"); + KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); + } + + // lset + KOKKOS_INLINE_FUNCTION + void lset(const size_type block, const scalar_t &value) const { + KokkosBlas::SerialSet::invoke(value, lget(block)); + } + + KOKKOS_INLINE_FUNCTION + void lset(const size_type block, + const AValuesUnmanaged2DBlockType &rhs) const { + auto lblock = lget(block); + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + lblock(i, j) = rhs(i, j); + } } -#else - if (U_values(ipos) == 0.0) { - U_values(ipos) = 1e6; - } else { - U_values(ipos) = 1.0 / U_values(ipos); + } + + // uset + KOKKOS_INLINE_FUNCTION + void uset(const size_type block, const scalar_t &value) const { + KokkosBlas::SerialSet::invoke(value, uget(block)); + } + + KOKKOS_INLINE_FUNCTION + void uset(const size_type block, + const AValuesUnmanaged2DBlockType &rhs) const { + auto ublock = uget(block); + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + ublock(i, j) = rhs(i, j); + } } -#endif - }); - //} - - team.team_barrier(); - - // Reset - k1 = static_cast(L_row_map(rowid)); - k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); -#else - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); -#endif + } - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(U_entries(k)); - iw(my_team, col) = -1; - }); - } -}; - -template -void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, - const AEntriesType &A_entries, const AValuesType &A_values, - const LRowMapType &L_row_map, const LEntriesType &L_entries, - LValuesType &L_values, const URowMapType &U_row_map, - const UEntriesType &U_entries, UValuesType &U_values) { - using execution_space = typename IlukHandle::execution_space; - using size_type = typename IlukHandle::size_type; - using nnz_lno_t = typename IlukHandle::nnz_lno_t; - using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - - size_type nlevels = thandle.get_num_levels(); - int team_size = thandle.get_team_size(); - - LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - - LevelHostViewType level_nchunks_h, level_nrowsperchunk_h; - WorkViewType iw; - - //{ - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_nchunks_h = thandle.get_level_nchunks(); - level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); - } - iw = thandle.get_iw(); + // lset_id + KOKKOS_INLINE_FUNCTION + void lset_id(const size_type block) const { + KokkosBatched::SerialSetIdentity::invoke(lget(block)); + } - // Main loop must be performed sequential. Question: Try out Cuda's graph - // stuff to reduce kernel launch overhead - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - nnz_lno_t lev_start = level_ptr_h(lvl); - nnz_lno_t lev_end = level_ptr_h(lvl + 1); + KOKKOS_INLINE_FUNCTION + void lset_id(const member_type &team, const size_type block) const { + KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); + } - if ((lev_end - lev_start) != 0) { - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::RangePolicy(lev_start, lev_end), - ILUKLvlSchedRPNumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, - LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( - A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, level_idx, iw, lev_start)); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - - nnz_lno_t lvl_rowid_start = 0; - nnz_lno_t lvl_nrows_chunk; - for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { - if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > - (lev_end - lev_start)) - lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; - else - lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - - ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, level_idx, iw, - lev_start + lvl_rowid_start); - - if (team_size == -1) - Kokkos::parallel_for( - "parfor_tp1", policy_type(lvl_nrows_chunk, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for("parfor_tp1", - policy_type(lvl_nrows_chunk, team_size), tstf); - Kokkos::fence(); - lvl_rowid_start += lvl_nrows_chunk; + // divide. lhs /= rhs + KOKKOS_INLINE_FUNCTION + void divide(const LValuesUnmanaged2DBlockType &lhs, + const UValuesUnmanaged2DBlockType &rhs) const { + KokkosBatched::SerialTrsm< + KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, // not 100% on this + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>:: // not 100% on this + invoke(1.0, rhs, lhs); + } + + KOKKOS_INLINE_FUNCTION + void divide(const member_type &team, const LValuesUnmanaged2DBlockType &lhs, + const UValuesUnmanaged2DBlockType &rhs) const { + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, // not 100% on this + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>:: // not 100% on this + invoke(team, 1.0, rhs, lhs); + } + + // add. lhs += rhs + template + KOKKOS_INLINE_FUNCTION void add(Lview lhs, const Rview &rhs) const { + KokkosBatched::SerialAxpy::invoke(ones, rhs, lhs); + } + + // multiply: return (alpha * lhs) * rhs + KOKKOS_INLINE_FUNCTION + LValuesUnmanaged2DBlockType multiply(const scalar_t &alpha, + const UValuesUnmanaged2DBlockType &lhs, + const LValuesUnmanaged2DBlockType &rhs, + scalar_t *buff) const { + LValuesUnmanaged2DBlockType result(&buff[0], block_size, block_size); + KokkosBatched::SerialGemm:: + invoke( + alpha, lhs, rhs, 0.0, result); + return result; + } + + // lget + KOKKOS_INLINE_FUNCTION + LValuesUnmanaged2DBlockType lget(const size_type block) const { + return LValuesUnmanaged2DBlockType( + L_values.data() + (block * block_items), block_size, block_size); + } + + // uget + KOKKOS_INLINE_FUNCTION + UValuesUnmanaged2DBlockType uget(const size_type block) const { + return UValuesUnmanaged2DBlockType( + U_values.data() + (block * block_items), block_size, block_size); + } + + // aget + KOKKOS_INLINE_FUNCTION + AValuesUnmanaged2DBlockType aget(const size_type block) const { + return AValuesUnmanaged2DBlockType( + A_values.data() + (block * block_items), block_size, block_size); + } + + // uequal + KOKKOS_INLINE_FUNCTION + bool uequal(const size_type block, const scalar_t &value) const { + auto u_block = uget(block); + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + if (u_block(i, j) != value) { + return false; + } } } - } // end if - } // end for lvl - //} + return true; + } + }; + + template + struct ILUKLvlSchedRPNumericFunctor + : public Common { + using Base = Common; + + ILUKLvlSchedRPNumericFunctor( + const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_ = 0) + : Base(A_row_map_, A_entries_, A_values_, L_row_map_, L_entries_, + L_values_, U_row_map_, U_entries_, U_values_, level_idx_, iw_, + lev_start_, block_size_) {} + + KOKKOS_FUNCTION + void operator()(const lno_t i) const { + scalar_t buff[Base::BUFF_SIZE]; + + const auto rowid = Base::level_idx(i); + const auto tid = i - Base::lev_start; + auto k1 = Base::L_row_map(rowid); + auto k2 = Base::L_row_map(rowid + 1) - 1; + Base::lset_id(k2); + for (auto k = k1; k < k2; ++k) { + const auto col = Base::L_entries(k); + Base::lset(k, 0.0); + Base::iw(tid, col) = k; + } -// Output check -#ifdef NUMERIC_OUTPUT_INFO - std::cout << " iluk_numeric result: " << std::endl; + k1 = Base::U_row_map(rowid); + k2 = Base::U_row_map(rowid + 1); + for (auto k = k1; k < k2; ++k) { + const auto col = Base::U_entries(k); + Base::uset(k, 0.0); + Base::iw(tid, col) = k; + } - std::cout << " nnzL: " << thandle.get_nnzL() << std::endl; - std::cout << " L_row_map = "; - for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { - std::cout << L_row_map(i) << " "; - } - std::cout << std::endl; + k1 = Base::A_row_map(rowid); + k2 = Base::A_row_map(rowid + 1); + for (auto k = k1; k < k2; ++k) { + const auto col = Base::A_entries(k); + const auto ipos = Base::iw(tid, col); + if (col < rowid) { + Base::lset(ipos, Base::aget(k)); + } else { + Base::uset(ipos, Base::aget(k)); + } + } - std::cout << " L_entries = "; - for (size_type i = 0; i < thandle.get_nnzL(); ++i) { - std::cout << L_entries(i) << " "; - } - std::cout << std::endl; + // Eliminate prev rows + k1 = Base::L_row_map(rowid); + k2 = Base::L_row_map(rowid + 1) - 1; + for (auto k = k1; k < k2; ++k) { + const auto prev_row = Base::L_entries(k); + const auto u_diag = Base::uget(Base::U_row_map(prev_row)); + Base::divide(Base::lget(k), u_diag); + auto fact = Base::lget(k); + for (auto kk = Base::U_row_map(prev_row) + 1; + kk < Base::U_row_map(prev_row + 1); ++kk) { + const auto col = Base::U_entries(kk); + const auto ipos = Base::iw(tid, col); + if (ipos == -1) continue; + const auto lxu = Base::multiply(-1.0, Base::uget(kk), fact, &buff[0]); + if (col < rowid) { + Base::add(Base::lget(ipos), lxu); + } else { + Base::add(Base::uget(ipos), lxu); + } + } // end for kk + } // end for k + + const auto ipos = Base::iw(tid, rowid); + if (Base::uequal(ipos, 0.0)) { + Base::uset(ipos, 1e6); + } - std::cout << " L_values = "; - for (size_type i = 0; i < thandle.get_nnzL(); ++i) { - std::cout << L_values(i) << " "; - } - std::cout << std::endl; + // Reset + k1 = Base::L_row_map(rowid); + k2 = Base::L_row_map(rowid + 1) - 1; + for (auto k = k1; k < k2; ++k) Base::iw(tid, Base::L_entries(k)) = -1; - std::cout << " nnzU: " << thandle.get_nnzU() << std::endl; - std::cout << " U_row_map = "; - for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { - std::cout << U_row_map(i) << " "; - } - std::cout << std::endl; + k1 = Base::U_row_map(rowid); + k2 = Base::U_row_map(rowid + 1); + for (auto k = k1; k < k2; ++k) Base::iw(tid, Base::U_entries(k)) = -1; + } + }; + + template + struct ILUKLvlSchedTP1NumericFunctor + : public Common { + using Base = Common; + + ILUKLvlSchedTP1NumericFunctor( + const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_ = 0) + : Base(A_row_map_, A_entries_, A_values_, L_row_map_, L_entries_, + L_values_, U_row_map_, U_entries_, U_values_, level_idx_, iw_, + lev_start_, block_size_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + const auto my_team = team.league_rank(); + const auto rowid = + Base::level_idx(my_team + Base::lev_start); // map to rowid + size_type k1 = Base::L_row_map(rowid); + size_type k2 = Base::L_row_map(rowid + 1) - 1; + Base::lset_id(team, k2); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::L_entries(k); + Base::lset(k, 0.0); + Base::iw(my_team, col) = k; + }); - std::cout << " U_entries = "; - for (size_type i = 0; i < thandle.get_nnzU(); ++i) { - std::cout << U_entries(i) << " "; - } - std::cout << std::endl; + team.team_barrier(); - std::cout << " U_values = "; - for (size_type i = 0; i < thandle.get_nnzU(); ++i) { - std::cout << U_values(i) << " "; - } - std::cout << std::endl; -#endif + k1 = Base::U_row_map(rowid); + k2 = Base::U_row_map(rowid + 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::U_entries(k); + Base::uset(k, 0.0); + Base::iw(my_team, col) = k; + }); + + team.team_barrier(); + + // Unpack the ith row of A + k1 = Base::A_row_map(rowid); + k2 = Base::A_row_map(rowid + 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::A_entries(k); + const auto ipos = Base::iw(my_team, col); + if (col < rowid) { + Base::lset(ipos, Base::aget(k)); + } else { + Base::uset(ipos, Base::aget(k)); + } + }); + + team.team_barrier(); + + // Eliminate prev rows + k1 = Base::L_row_map(rowid); + k2 = Base::L_row_map(rowid + 1) - 1; + for (auto k = k1; k < k2; k++) { + const auto prev_row = Base::L_entries(k); + const auto udiag = Base::uget(Base::U_row_map(prev_row)); + Base::divide(team, Base::lget(k), udiag); + auto fact = Base::lget(k); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, Base::U_row_map(prev_row) + 1, + Base::U_row_map(prev_row + 1)), + [&](const size_type kk) { + const auto col = Base::U_entries(kk); + const auto ipos = Base::iw(my_team, col); + if (ipos != -1) { + scalar_t buff[Base::BUFF_SIZE]; + auto lxu = Base::multiply(-1.0, Base::uget(kk), fact, &buff[0]); + if (col < rowid) { + Base::add(Base::lget(ipos), lxu); + } else { + Base::add(Base::uget(ipos), lxu); + } + } + }); // end for kk + + team.team_barrier(); + } // end for k + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + const auto ipos = Base::iw(my_team, rowid); + if (Base::uequal(ipos, 0.0)) { + Base::uset(ipos, 1e6); + } + }); + + team.team_barrier(); -} // end iluk_numeric - -template -void iluk_numeric_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &A_row_map_v, - const std::vector &A_entries_v, - const std::vector &A_values_v, - const std::vector &L_row_map_v, - const std::vector &L_entries_v, - std::vector &L_values_v, - const std::vector &U_row_map_v, - const std::vector &U_entries_v, - std::vector &U_values_v) { - using size_type = typename IlukHandle::size_type; - using nnz_lno_t = typename IlukHandle::nnz_lno_t; - using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector lvl_ptr_h_v(nstreams); - std::vector lvl_idx_v(nstreams); // device views - std::vector lvl_start_v(nstreams); - std::vector lvl_end_v(nstreams); - std::vector iw_v(nstreams); // device views - std::vector stream_have_level_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); - lvl_idx_v[i] = thandle_v[i]->get_level_idx(); - iw_v[i] = thandle_v[i]->get_iw(); - stream_have_level_v[i] = true; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + // Reset + k1 = Base::L_row_map(rowid); + k2 = Base::L_row_map(rowid + 1) - 1; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::L_entries(k); + Base::iw(my_team, col) = -1; + }); + + k1 = Base::U_row_map(rowid); + k2 = Base::U_row_map(rowid + 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::U_entries(k); + Base::iw(my_team, col) = -1; + }); + } + }; + +#define FunctorTypeMacro(Functor, BlockEnabled) \ + Functor + +#define KernelLaunchMacro(arow, aent, aval, lrow, lent, lval, urow, uent, \ + uval, polc, name, lidx, iwv, lstrt, ftf, ftb, be, \ + bs) \ + if (be) { \ + ftb functor(arow, aent, aval, lrow, lent, lval, urow, uent, uval, lidx, \ + iwv, lstrt, bs); \ + Kokkos::parallel_for(name, polc, functor); \ + } else { \ + ftf functor(arow, aent, aval, lrow, lent, lval, urow, uent, uval, lidx, \ + iwv, lstrt); \ + Kokkos::parallel_for(name, polc, functor); \ } - // Assume all streams use the same algorithm - if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // Initial work across streams at each level - for (int i = 0; i < nstreams; i++) { - // Only do this if this stream has this level - if (lvl < nlevels_v[i]) { - lvl_start_v[i] = lvl_ptr_h_v[i](lvl); - lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); - if ((lvl_end_v[i] - lvl_start_v[i]) != 0) - stream_have_level_v[i] = true; - else - stream_have_level_v[i] = false; - } else - stream_have_level_v[i] = false; - } + template + static void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, + const AEntriesType &A_entries, + const AValuesType &A_values, + const LRowMapType &L_row_map, + const LEntriesType &L_entries, LValuesType &L_values, + const URowMapType &U_row_map, + const UEntriesType &U_entries, + UValuesType &U_values) { + using RPF = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, false); + using RPB = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, true); + using TPF = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, false); + using TPB = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, true); + + size_type nlevels = thandle.get_num_levels(); + int team_size = thandle.get_team_size(); + const auto block_size = thandle.get_block_size(); + + LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); + LevelViewType level_idx = thandle.get_level_idx(); + + LevelHostViewType level_nchunks_h, level_nrowsperchunk_h; + WorkViewType iw; + + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + level_nchunks_h = thandle.get_level_nchunks(); + level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); + } + iw = thandle.get_iw(); + + // Main loop must be performed sequential. Question: Try out Cuda's graph + // stuff to reduce kernel launch overhead + for (size_type lvl = 0; lvl < nlevels; ++lvl) { + lno_t lev_start = level_ptr_h(lvl); + lno_t lev_end = level_ptr_h(lvl + 1); + + if ((lev_end - lev_start) != 0) { + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { + range_policy rpolicy = get_range_policy(lev_start, lev_end); + KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, U_row_map, U_entries, U_values, + rpolicy, "parfor_fixed_lvl", level_idx, iw, + lev_start, RPF, RPB, thandle.is_block_enabled(), + block_size); + } else if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm:: + SEQLVLSCHD_TP1) { + lno_t lvl_rowid_start = 0; + lno_t lvl_nrows_chunk; + for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { + if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > + (lev_end - lev_start)) + lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + + team_policy tpolicy = get_team_policy(lvl_nrows_chunk, team_size); + KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, U_row_map, U_entries, + U_values, tpolicy, "parfor_tp1", level_idx, iw, + lev_start + lvl_rowid_start, TPF, TPB, + thandle.is_block_enabled(), block_size); + Kokkos::fence(); + lvl_rowid_start += lvl_nrows_chunk; + } + } + } // end if + } // end for lvl + //} - // Main work of the level across streams - // 1. Launch work on all streams - for (int i = 0; i < nstreams; i++) { - // Launch only if stream i-th has this level - if (stream_have_level_v[i]) { - ILUKLvlSchedRPNumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], - iw_v[i], lvl_start_v[i]); - Kokkos::parallel_for( - "parfor_rp", - Kokkos::RangePolicy(execspace_v[i], - lvl_start_v[i], lvl_end_v[i]), - tstf); - } // end if (stream_have_level_v[i]) - } // end for streams - } // end for lvl - } // end SEQLVLSCHD_RP - else if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - - std::vector lvl_nchunks_h_v(nstreams); - std::vector lvl_nrowsperchunk_h_v(nstreams); - std::vector lvl_rowid_start_v(nstreams); - std::vector team_size_v(nstreams); +// Output check +#ifdef NUMERIC_OUTPUT_INFO + std::cout << " iluk_numeric result: " << std::endl; + + std::cout << " nnzL: " << thandle.get_nnzL() << std::endl; + std::cout << " L_row_map = "; + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { + std::cout << L_row_map(i) << " "; + } + std::cout << std::endl; + + std::cout << " L_entries = "; + for (size_type i = 0; i < thandle.get_nnzL(); ++i) { + std::cout << L_entries(i) << " "; + } + std::cout << std::endl; + + std::cout << " L_values = "; + for (size_type i = 0; i < thandle.get_nnzL(); ++i) { + std::cout << L_values(i) << " "; + } + std::cout << std::endl; + + std::cout << " nnzU: " << thandle.get_nnzU() << std::endl; + std::cout << " U_row_map = "; + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { + std::cout << U_row_map(i) << " "; + } + std::cout << std::endl; + + std::cout << " U_entries = "; + for (size_type i = 0; i < thandle.get_nnzU(); ++i) { + std::cout << U_entries(i) << " "; + } + std::cout << std::endl; + + std::cout << " U_values = "; + for (size_type i = 0; i < thandle.get_nnzU(); ++i) { + std::cout << U_values(i) << " "; + } + std::cout << std::endl; +#endif + } // end iluk_numeric + + template + static void iluk_numeric_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v) { + using RPF = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, false); + using RPB = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, true); + using TPF = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, false); + using TPB = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, true); + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector lvl_ptr_h_v(nstreams); + std::vector lvl_idx_v(nstreams); // device views + std::vector lvl_start_v(nstreams); + std::vector lvl_end_v(nstreams); + std::vector iw_v(nstreams); // device views + std::vector stream_have_level_v(nstreams); + std::vector is_block_enabled_v(nstreams); + std::vector block_size_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); - lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); - team_size_v[i] = thandle_v[i]->get_team_size(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); + lvl_idx_v[i] = thandle_v[i]->get_level_idx(); + iw_v[i] = thandle_v[i]->get_iw(); + is_block_enabled_v[i] = thandle_v[i]->is_block_enabled(); + block_size_v[i] = thandle_v[i]->get_block_size(); + stream_have_level_v[i] = true; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; } - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // Initial work across streams at each level - nnz_lno_t lvl_nchunks_max = 0; - for (int i = 0; i < nstreams; i++) { - // Only do this if this stream has this level - if (lvl < nlevels_v[i]) { - lvl_start_v[i] = lvl_ptr_h_v[i](lvl); - lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); - if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { - stream_have_level_v[i] = true; - lvl_rowid_start_v[i] = 0; - if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) - lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); + // Assume all streams use the same algorithm + if (thandle_v[0]->get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + for (int i = 0; i < nstreams; i++) { + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) + stream_have_level_v[i] = true; + else + stream_have_level_v[i] = false; } else stream_have_level_v[i] = false; - } else - stream_have_level_v[i] = false; - } + } - // Main work of the level across streams -- looping through chunnks - for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { - // 1. Launch work on all streams (for each chunk) + // Main work of the level across streams + // 1. Launch work on all streams for (int i = 0; i < nstreams; i++) { // Launch only if stream i-th has this level if (stream_have_level_v[i]) { - // Launch only if stream i-th has this chunk - if (chunkid < lvl_nchunks_h_v[i](lvl)) { - // 1.a. Specify number of rows (i.e. number of teams) to launch - nnz_lno_t lvl_nrows_chunk = 0; - if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > - (lvl_end_v[i] - lvl_start_v[i])) - lvl_nrows_chunk = - (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; - else - lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); - - // 1.b. Create functor for stream i-th and launch - ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, - LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], - lvl_idx_v[i], iw_v[i], - lvl_start_v[i] + lvl_rowid_start_v[i]); - if (team_size_v[i] == -1) - Kokkos::parallel_for( - "parfor_tp1", - policy_type(execspace_v[i], lvl_nrows_chunk, Kokkos::AUTO), - tstf); - else - Kokkos::parallel_for( - "parfor_tp1", - policy_type(execspace_v[i], lvl_nrows_chunk, - team_size_v[i]), - tstf); - - // 1.c. Ready to move to next chunk - lvl_rowid_start_v[i] += lvl_nrows_chunk; - } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) - } // end if (stream_have_level_v[i]) - } // end for streams - } // end for chunkid - } // end for lvl - } // end SEQLVLSCHD_TP1 - -} // end iluk_numeric_streams + range_policy rpolicy = + get_range_policy(execspace_v[i], lvl_start_v[i], lvl_end_v[i]); + KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + rpolicy, "parfor_rp", lvl_idx_v[i], iw_v[i], + lvl_start_v[i], RPF, RPB, is_block_enabled_v[i], + block_size_v[i]); + } // end if (stream_have_level_v[i]) + } // end for streams + } // end for lvl + } // end SEQLVLSCHD_RP + else if (thandle_v[0]->get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + std::vector lvl_nchunks_h_v(nstreams); + std::vector lvl_nrowsperchunk_h_v(nstreams); + std::vector lvl_rowid_start_v(nstreams); + std::vector team_size_v(nstreams); + + for (int i = 0; i < nstreams; i++) { + lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); + lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); + team_size_v[i] = thandle_v[i]->get_team_size(); + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + lno_t lvl_nchunks_max = 0; + for (int i = 0; i < nstreams; i++) { + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { + stream_have_level_v[i] = true; + lvl_rowid_start_v[i] = 0; + if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) + lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); + } else + stream_have_level_v[i] = false; + } else + stream_have_level_v[i] = false; + } + + // Main work of the level across streams -- looping through chunnks + for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { + // 1. Launch work on all streams (for each chunk) + for (int i = 0; i < nstreams; i++) { + // Launch only if stream i-th has this level + if (stream_have_level_v[i]) { + // Launch only if stream i-th has this chunk + if (chunkid < lvl_nchunks_h_v[i](lvl)) { + // 1.a. Specify number of rows (i.e. number of teams) to launch + lno_t lvl_nrows_chunk = 0; + if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > + (lvl_end_v[i] - lvl_start_v[i])) + lvl_nrows_chunk = + (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; + else + lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); + + // 1.b. Create functor for stream i-th and launch + team_policy tpolicy = get_team_policy( + execspace_v[i], lvl_nrows_chunk, team_size_v[i]); + KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + tpolicy, "parfor_tp1", lvl_idx_v[i], iw_v[i], + lvl_start_v[i] + lvl_rowid_start_v[i], TPF, + TPB, is_block_enabled_v[i], block_size_v[i]); + // 1.c. Ready to move to next chunk + lvl_rowid_start_v[i] += lvl_nrows_chunk; + } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) + } // end if (stream_have_level_v[i]) + } // end for streams + } // end for chunkid + } // end for lvl + } // end SEQLVLSCHD_TP1 + + } // end iluk_numeric_streams + +}; // IlukWrap } // namespace Experimental } // namespace Impl } // namespace KokkosSparse +#undef FunctorTypeMacro +#undef KernelLaunchMacro + #endif diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index 12f8c43caf..f58f691e89 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -145,6 +145,8 @@ struct SPILUK_NUMERIC { + using Iluk = Experimental::IlukWrap; + static void spiluk_numeric( KernelHandle *handle, const typename KernelHandle::const_nnz_lno_t & /*fill_lev*/, @@ -155,9 +157,9 @@ struct SPILUK_NUMERICget_spiluk_handle(); - Experimental::iluk_numeric(*spiluk_handle, A_row_map, A_entries, A_values, - L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values); + Iluk::iluk_numeric(*spiluk_handle, A_row_map, A_entries, A_values, + L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values); } static void spiluk_numeric_streams( @@ -178,10 +180,10 @@ struct SPILUK_NUMERIC static_cast(L_entries_d.extent(0))) { -#else - if (cntL + lenl > static_cast(L_entries_d.extent(0))) { -#endif // size_type newsize = (size_type) (L_entries_d.extent(0)*EXPAND_FACT); // Kokkos::resize(L_entries, newsize); // Kokkos::resize(L_entries_d, newsize); @@ -412,11 +408,9 @@ void iluk_symbolic(IlukHandle& thandle, L_entries(cntL) = h_iL(k); cntL++; } -#ifdef KEEP_DIAG // L diag entry L_entries(cntL) = i; cntL++; -#endif L_row_map(i + 1) = cntL; } // End main loop i diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 4f33795018..6a5b5b6f20 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -947,11 +947,13 @@ class KokkosKernelsHandle { SPILUKHandleType *get_spiluk_handle() { return this->spilukHandle; } void create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm algm, - size_type nrows, size_type nnzL, size_type nnzU) { + size_type nrows, size_type nnzL, size_type nnzU, + size_type block_size = 0) { this->destroy_spiluk_handle(); this->is_owner_of_the_spiluk_handle = true; - this->spilukHandle = new SPILUKHandleType(algm, nrows, nnzL, nnzU); - this->spilukHandle->reset_handle(nrows, nnzL, nnzU); + this->spilukHandle = + new SPILUKHandleType(algm, nrows, nnzL, nnzU, block_size); + this->spilukHandle->reset_handle(nrows, nnzL, nnzU, block_size); this->spilukHandle->set_team_size(this->team_work_size); this->spilukHandle->set_vector_size(this->vector_size); } diff --git a/sparse/src/KokkosSparse_spiluk.hpp b/sparse/src/KokkosSparse_spiluk.hpp index 1bf78abe5e..b3644a8709 100644 --- a/sparse/src/KokkosSparse_spiluk.hpp +++ b/sparse/src/KokkosSparse_spiluk.hpp @@ -530,7 +530,6 @@ void spiluk_numeric(KernelHandle* handle, A_entries_i, A_values_i, L_rowmap_i, L_entries_i, L_values_i, U_rowmap_i, U_entries_i, U_values_i); - } // spiluk_numeric template class SPILUKHandle { public: - typedef ExecutionSpace HandleExecSpace; - typedef TemporaryMemorySpace HandleTempMemorySpace; - typedef PersistentMemorySpace HandlePersistentMemorySpace; + using HandleExecSpace = ExecutionSpace; + using HandleTempMemorySpace = TemporaryMemorySpace; + using HandlePersistentMemorySpace = PersistentMemorySpace; - typedef ExecutionSpace execution_space; - typedef HandlePersistentMemorySpace memory_space; + using execution_space = ExecutionSpace; + using memory_space = HandlePersistentMemorySpace; - typedef typename std::remove_const::type size_type; - typedef const size_type const_size_type; + using TeamPolicy = Kokkos::TeamPolicy; + using RangePolicy = Kokkos::RangePolicy; - typedef typename std::remove_const::type nnz_lno_t; - typedef const nnz_lno_t const_nnz_lno_t; + using size_type = typename std::remove_const::type; + using const_size_type = const size_type; - typedef typename std::remove_const::type nnz_scalar_t; - typedef const nnz_scalar_t const_nnz_scalar_t; + using nnz_lno_t = typename std::remove_const::type; + using const_nnz_lno_t = const nnz_lno_t; - typedef typename Kokkos::View - nnz_row_view_t; + using nnz_scalar_t = typename std::remove_const::type; + using const_nnz_scalar_t = const nnz_scalar_t; - typedef typename Kokkos::View - nnz_lno_view_t; + using nnz_row_view_t = Kokkos::View; - typedef typename Kokkos::View - nnz_row_view_host_t; + using nnz_lno_view_t = Kokkos::View; - typedef typename Kokkos::View - nnz_lno_view_host_t; + using nnz_value_view_t = + typename Kokkos::View; - typedef typename std::make_signed< - typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; - typedef Kokkos::View - signed_nnz_lno_view_t; + using nnz_row_view_host_t = + typename Kokkos::View; - typedef Kokkos::View - work_view_t; + using nnz_lno_view_host_t = + typename Kokkos::View; + + using signed_integral_t = typename std::make_signed< + typename nnz_row_view_t::non_const_value_type>::type; + using signed_nnz_lno_view_t = + Kokkos::View; + + using work_view_t = Kokkos::View; private: nnz_row_view_t level_list; // level IDs which the rows belong to @@ -95,6 +96,7 @@ class SPILUKHandle { size_type nlevels; size_type nnzL; size_type nnzU; + size_type block_size; size_type level_maxrows; // max. number of rows among levels size_type level_maxrowsperchunk; // max.number of rows among chunks among levels @@ -109,7 +111,7 @@ class SPILUKHandle { public: SPILUKHandle(SPILUKAlgorithm choice, const size_type nrows_, const size_type nnzL_, const size_type nnzU_, - bool symbolic_complete_ = false) + const size_type block_size_ = 0, bool symbolic_complete_ = false) : level_list(), level_idx(), level_ptr(), @@ -121,6 +123,7 @@ class SPILUKHandle { nlevels(0), nnzL(nnzL_), nnzU(nnzU_), + block_size(block_size_), level_maxrows(0), level_maxrowsperchunk(0), symbolic_complete(symbolic_complete_), @@ -129,11 +132,12 @@ class SPILUKHandle { vector_size(-1) {} void reset_handle(const size_type nrows_, const size_type nnzL_, - const size_type nnzU_) { + const size_type nnzU_, const size_type block_size_) { set_nrows(nrows_); set_num_levels(0); set_nnzL(nnzL_); set_nnzU(nnzU_); + set_block_size(block_size_); set_level_maxrows(0); set_level_maxrowsperchunk(0); level_list = nnz_row_view_t("level_list", nrows_), @@ -205,6 +209,14 @@ class SPILUKHandle { KOKKOS_INLINE_FUNCTION void set_nnzU(const size_type nnzU_) { this->nnzU = nnzU_; } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + KOKKOS_INLINE_FUNCTION + void set_block_size(const size_type block_size_) { + this->block_size = block_size_; + } + KOKKOS_INLINE_FUNCTION size_type get_level_maxrows() const { return level_maxrows; } @@ -223,6 +235,8 @@ class SPILUKHandle { bool is_symbolic_complete() const { return symbolic_complete; } + bool is_block_enabled() const { return block_size > 0; } + size_type get_num_levels() const { return nlevels; } void set_num_levels(size_type nlevels_) { this->nlevels = nlevels_; } diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 4370ebe37e..cda09d0639 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -29,6 +29,8 @@ #include "KokkosSparse_LUPrec.hpp" #include "KokkosSparse_SortCrs.hpp" +#include "Test_vector_fixtures.hpp" + #include using namespace KokkosSparse; @@ -52,69 +54,6 @@ struct TolMeta { } // namespace ParIlut -template -std::vector> decompress_matrix( - Kokkos::View& row_map, - Kokkos::View& entries, - Kokkos::View& values) { - const size_type nrows = row_map.size() - 1; - std::vector> result; - result.resize(nrows); - for (auto& row : result) { - row.resize(nrows, 0.0); - } - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - Kokkos::deep_copy(hrow_map, row_map); - Kokkos::deep_copy(hentries, entries); - Kokkos::deep_copy(hvalues, values); - - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - const size_type row_nnz_begin = hrow_map(row_idx); - const size_type row_nnz_end = hrow_map(row_idx + 1); - for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { - const lno_t col_idx = hentries(row_nnz); - const scalar_t value = hvalues(row_nnz); - result[row_idx][col_idx] = value; - } - } - - return result; -} - -template -void check_matrix(const std::string& name, - Kokkos::View& row_map, - Kokkos::View& entries, - Kokkos::View& values, - const std::vector>& expected) { - const auto decompressed_mtx = decompress_matrix(row_map, entries, values); - - const size_type nrows = row_map.size() - 1; - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { - EXPECT_NEAR(expected[row_idx][col_idx], - decompressed_mtx[row_idx][col_idx], 0.01) - << "Failed check is: " << name << "[" << row_idx << "][" << col_idx - << "]"; - } - } -} - -template -void print_matrix(const std::vector>& matrix) { - for (const auto& row : matrix) { - for (const auto& item : row) { - std::printf("%.2f ", item); - } - std::cout << std::endl; - } -} - template void run_test_par_ilut() { @@ -131,47 +70,14 @@ void run_test_par_ilut() { {0.5, -3., 6., 0.}, {0.2, -0.5, -9., 0.}}; - const scalar_t ZERO = scalar_t(0); - - const size_type nrows = A.size(); - - // Count A nnz's - size_type nnz = 0; - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { - if (A[row_idx][col_idx] != ZERO) { - ++nnz; - } - } - } - // Allocate device CRS views for A - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - // Create host mirror views for CRS A - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); + RowMapType row_map("row_map", 0); + EntriesType entries("entries", 0); + ValuesType values("values", 0); - // Compress A into CRS (host views) - size_type curr_nnz = 0; - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { - if (A[row_idx][col_idx] != ZERO) { - hentries(curr_nnz) = col_idx; - hvalues(curr_nnz) = A[row_idx][col_idx]; - ++curr_nnz; - } - hrow_map(row_idx + 1) = curr_nnz; - } - } + compress_matrix(row_map, entries, values, A); - // Copy host A CRS views to device A CRS views - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); + const size_type nrows = A.size(); // Make kernel handle KernelHandle kh; diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 77cdb1ede1..7d52d08ee6 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -26,129 +26,154 @@ #include "KokkosBlas1_nrm2.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_spiluk.hpp" +#include "KokkosSparse_crs_to_bsr_impl.hpp" +#include "KokkosSparse_bsr_to_crs_impl.hpp" -#include +#include "Test_vector_fixtures.hpp" + +#include using namespace KokkosSparse; using namespace KokkosSparse::Experimental; using namespace KokkosKernels; using namespace KokkosKernels::Experimental; -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #define kokkos_complex_float Kokkos::complex -// #endif - -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; namespace Test { +template +std::vector> get_9x9_fixture() { + std::vector> A = { + {10.00, 0.00, 0.30, 0.00, 0.00, 0.60, 0.00, 0.00, 0.00}, + {0.00, 11.00, 0.00, 0.00, 0.00, 0.00, 0.70, 0.00, 0.00}, + {0.00, 0.00, 12.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + {5.00, 0.00, 0.00, 13.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {4.00, 0.00, 0.00, 0.00, 14.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 3.00, 0.00, 0.00, 0.00, 15.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 7.00, 0.00, 0.00, 0.00, 16.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 6.00, 5.00, 0.00, 0.00, 17.00, 0.00}, + {0.00, 0.00, 0.00, 2.00, 2.50, 0.00, 0.00, 0.00, 18.00}}; + return A; +} + +template +std::vector> get_4x4_fixture() { + std::vector> A = {{10.00, 1.00, 0.00, 0.00}, + {0.00, 11.00, 0.00, 0.00}, + {0.00, 2.00, 12.00, 0.00}, + {5.00, 0.00, 0.00, 13.00}}; + return A; +} + +static constexpr double EPS = 1e-7; + template -void run_test_spiluk() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; - typedef Kokkos::ArithTraits AT; - - const size_type nrows = 9; - const size_type nnz = 21; - - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - scalar_t MONE = scalar_t(-1); - - hrow_map(0) = 0; - hrow_map(1) = 3; - hrow_map(2) = 5; - hrow_map(3) = 6; - hrow_map(4) = 9; - hrow_map(5) = 11; - hrow_map(6) = 13; - hrow_map(7) = 15; - hrow_map(8) = 18; - hrow_map(9) = nnz; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 5; - hentries(3) = 1; - hentries(4) = 6; - hentries(5) = 2; - hentries(6) = 0; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 0; - hentries(10) = 4; - hentries(11) = 1; - hentries(12) = 5; - hentries(13) = 2; - hentries(14) = 6; - hentries(15) = 3; - hentries(16) = 4; - hentries(17) = 7; - hentries(18) = 3; - hentries(19) = 4; - hentries(20) = 8; - - hvalues(0) = 10; - hvalues(1) = 0.3; - hvalues(2) = 0.6; - hvalues(3) = 11; - hvalues(4) = 0.7; - hvalues(5) = 12; - hvalues(6) = 5; - hvalues(7) = 13; - hvalues(8) = 1; - hvalues(9) = 4; - hvalues(10) = 14; - hvalues(11) = 3; - hvalues(12) = 15; - hvalues(13) = 7; - hvalues(14) = 16; - hvalues(15) = 6; - hvalues(16) = 5; - hvalues(17) = 17; - hvalues(18) = 2; - hvalues(19) = 2.5; - hvalues(20) = 18; - - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< +struct SpilukTest { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using AT = Kokkos::ArithTraits; + + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> - KernelHandle; + typename device::memory_space, typename device::memory_space>; + + using Crs = CrsMatrix; + using Bsr = BsrMatrix; + + template + static typename AT::mag_type check_result_impl( + const AType& A, const LType& L, const UType& U, const size_type nrows, + const size_type block_size = 1) { + const scalar_t ZERO = scalar_t(0); + const scalar_t ONE = scalar_t(1); + const scalar_t MONE = scalar_t(-1); - KernelHandle kh; + // Create a reference view e set to all 1's + ValuesType e_one("e_one", nrows * block_size); + Kokkos::deep_copy(e_one, ONE); + + // Create two views for spmv results + ValuesType bb("bb", nrows * block_size); + ValuesType bb_tmp("bb_tmp", nrows * block_size); + + // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) + KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + + typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + + KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); + KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + + typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + + return diff_nrm / bb_nrm; + } + + static void check_result(const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const RowMapType& L_row_map, + const EntriesType& L_entries, + const ValuesType& L_values, + const RowMapType& U_row_map, + const EntriesType& U_entries, + const ValuesType& U_values) { + // Checking + const auto nrows = row_map.extent(0) - 1; + Crs A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries); + Crs L("L_Mtx", nrows, nrows, L_values.extent(0), L_values, L_row_map, + L_entries); + Crs U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, + U_entries); - // SPILUKAlgorithm::SEQLVLSCHD_RP - { - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, 4 * nrows, - 4 * nrows); + const auto result = check_result_impl(A, L, U, nrows); + + EXPECT_LT(result, 1e-4); + } + + static void check_result_block( + const RowMapType& row_map, const EntriesType& entries, + const ValuesType& values, const RowMapType& L_row_map, + const EntriesType& L_entries, const ValuesType& L_values, + const RowMapType& U_row_map, const EntriesType& U_entries, + const ValuesType& U_values, const size_type block_size) { + // Checking + const auto nrows = row_map.extent(0) - 1; + Bsr A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries, + block_size); + Bsr L("L_Mtx", nrows, nrows, L_values.extent(0), L_values, L_row_map, + L_entries, block_size); + Bsr U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, + U_entries, block_size); + + const auto result = check_result_impl(A, L, U, nrows, block_size); + EXPECT_LT(result, 1e0); + } + + static std::tuple + run_and_check_spiluk(KernelHandle& kh, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + SPILUKAlgorithm alg, const lno_t fill_lev) { + const size_type nrows = row_map.extent(0) - 1; + kh.create_spiluk_handle(alg, nrows, 4 * nrows, 4 * nrows); auto spiluk_handle = kh.get_spiluk_handle(); // Allocate L and U as outputs RowMapType L_row_map("L_row_map", nrows + 1); EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); RowMapType U_row_map("U_row_map", nrows + 1); EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, U_row_map, U_entries); @@ -156,63 +181,54 @@ void run_test_spiluk() { Kokkos::fence(); Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - Kokkos::resize(L_values, spiluk_handle->get_nnzL()); Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - Kokkos::resize(U_values, spiluk_handle->get_nnzU()); + ValuesType L_values("L_values", spiluk_handle->get_nnzL()); + ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - spiluk_handle->print_algorithm(); spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); Kokkos::fence(); - // Checking - typedef CrsMatrix crsMat_t; - crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, - L_row_map, L_entries); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, - U_row_map, U_entries); - - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); - - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); - - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); - - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + check_result(row_map, entries, values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values); - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); - - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + kh.destroy_spiluk_handle(); - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + // For team policy alg, check results against range policy + if (alg == SPILUKAlgorithm::SEQLVLSCHD_TP1) { + const auto [L_row_map_rp, L_entries_rp, L_values_rp, U_row_map_rp, + U_entries_rp, U_values_rp] = + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev); + + EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_rp, EPS); + EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_rp, EPS); + EXPECT_NEAR_KK_1DVIEW(L_values, L_values_rp, EPS); + EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_rp, EPS); + EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_rp, EPS); + EXPECT_NEAR_KK_1DVIEW(U_values, U_values_rp, EPS); + } - kh.destroy_spiluk_handle(); + return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values); } - // SPILUKAlgorithm::SEQLVLSCHD_TP1 - { - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, 4 * nrows, - 4 * nrows); + static void run_and_check_spiluk_block( + KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, + const ValuesType& values, SPILUKAlgorithm alg, const lno_t fill_lev, + const size_type block_size) { + const size_type block_items = block_size * block_size; + const size_type nrows = row_map.extent(0) - 1; + kh.create_spiluk_handle(alg, nrows, 4 * nrows, 4 * nrows, block_size); auto spiluk_handle = kh.get_spiluk_handle(); // Allocate L and U as outputs RowMapType L_row_map("L_row_map", nrows + 1); EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); RowMapType U_row_map("U_row_map", nrows + 1); EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, U_row_map, U_entries); @@ -220,292 +236,372 @@ void run_test_spiluk() { Kokkos::fence(); Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - Kokkos::resize(L_values, spiluk_handle->get_nnzL()); Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - Kokkos::resize(U_values, spiluk_handle->get_nnzU()); + ValuesType L_values("L_values", spiluk_handle->get_nnzL() * block_items); + ValuesType U_values("U_values", spiluk_handle->get_nnzU() * block_items); - spiluk_handle->print_algorithm(); spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); Kokkos::fence(); - // Checking - typedef CrsMatrix crsMat_t; - crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, - L_row_map, L_entries); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, - U_row_map, U_entries); + check_result_block(row_map, entries, values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, block_size); - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); + kh.destroy_spiluk_handle(); - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); + // If block_size is 1, results should exactly match unblocked results + if (block_size == 1) { + const auto [L_row_map_u, L_entries_u, L_values_u, U_row_map_u, + U_entries_u, U_values_u] = + run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev); + + EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_u, EPS); + EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_u, EPS); + EXPECT_NEAR_KK_1DVIEW(L_values, L_values_u, EPS); + EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_u, EPS); + EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_u, EPS); + EXPECT_NEAR_KK_1DVIEW(U_values, U_values_u, EPS); + } + } - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + static void run_test_spiluk() { + std::vector> A = get_9x9_fixture(); - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + RowMapType row_map; + EntriesType entries; + ValuesType values; - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + compress_matrix(row_map, entries, values, A); - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + const lno_t fill_lev = 2; - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + KernelHandle kh; - kh.destroy_spiluk_handle(); + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); } -} - -template -void run_test_spiluk_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - using crsMat_t = CrsMatrix; - using AT = Kokkos::ArithTraits; - // Workaround for OpenMP: skip tests if concurrency < nstreams because of - // not enough resource to partition - bool run_streams_test = true; + static void run_test_spiluk_streams(SPILUKAlgorithm test_algo, int nstreams) { + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; #ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - int exec_concurrency = execution_space().concurrency(); - if (exec_concurrency < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: concurrency = " << exec_concurrency - << std::endl; + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } } - } #endif - if (!run_streams_test) return; - - const size_type nrows = 9; - const size_type nnz = 21; - - std::vector instances; - if (nstreams == 1) - instances = Kokkos::Experimental::partition_space(execution_space(), 1); - else if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); - - std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); - std::vector A_row_map_v(nstreams); - std::vector A_entries_v(nstreams); - std::vector A_values_v(nstreams); - std::vector L_row_map_v(nstreams); - std::vector L_entries_v(nstreams); - std::vector L_values_v(nstreams); - std::vector U_row_map_v(nstreams); - std::vector U_entries_v(nstreams); - std::vector U_values_v(nstreams); - - RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); - EntriesType_hostmirror hentries("hentries", nnz); - ValuesType_hostmirror hvalues("hvalues", nnz); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - scalar_t MONE = scalar_t(-1); - - hrow_map(0) = 0; - hrow_map(1) = 3; - hrow_map(2) = 5; - hrow_map(3) = 6; - hrow_map(4) = 9; - hrow_map(5) = 11; - hrow_map(6) = 13; - hrow_map(7) = 15; - hrow_map(8) = 18; - hrow_map(9) = nnz; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 5; - hentries(3) = 1; - hentries(4) = 6; - hentries(5) = 2; - hentries(6) = 0; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 0; - hentries(10) = 4; - hentries(11) = 1; - hentries(12) = 5; - hentries(13) = 2; - hentries(14) = 6; - hentries(15) = 3; - hentries(16) = 4; - hentries(17) = 7; - hentries(18) = 3; - hentries(19) = 4; - hentries(20) = 8; - - hvalues(0) = 10; - hvalues(1) = 0.3; - hvalues(2) = 0.6; - hvalues(3) = 11; - hvalues(4) = 0.7; - hvalues(5) = 12; - hvalues(6) = 5; - hvalues(7) = 13; - hvalues(8) = 1; - hvalues(9) = 4; - hvalues(10) = 14; - hvalues(11) = 3; - hvalues(12) = 15; - hvalues(13) = 7; - hvalues(14) = 16; - hvalues(15) = 6; - hvalues(16) = 5; - hvalues(17) = 17; - hvalues(18) = 2; - hvalues(19) = 2.5; - hvalues(20) = 18; - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; - - for (int i = 0; i < nstreams; i++) { - // Allocate A as input - A_row_map_v[i] = RowMapType("A_row_map", nrows + 1); - A_entries_v[i] = EntriesType("A_entries", nnz); - A_values_v[i] = ValuesType("A_values", nnz); - - // Copy from host to device - Kokkos::deep_copy(A_row_map_v[i], hrow_map); - Kokkos::deep_copy(A_entries_v[i], hentries); - Kokkos::deep_copy(A_values_v[i], hvalues); - - // Create handle - kh_v[i] = KernelHandle(); - if (test_algo == 0) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, - 4 * nrows, 4 * nrows); - else if (test_algo == 1) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, - 4 * nrows, 4 * nrows); - kh_ptr_v[i] = &kh_v[i]; - - auto spiluk_handle = kh_v[i].get_spiluk_handle(); - std::cout << " Stream " << i << ": "; - spiluk_handle->print_algorithm(); + if (!run_streams_test) return; - // Allocate L and U as outputs - L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); - L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); - L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); - U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); - U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); - U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); - - // Symbolic phase - spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], - L_row_map_v[i], L_entries_v[i], U_row_map_v[i], - U_entries_v[i], nstreams); + std::vector weights(nstreams, 1); + std::vector instances = + Kokkos::Experimental::partition_space(execution_space(), weights); - Kokkos::fence(); + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector A_row_map_v(nstreams); + std::vector A_entries_v(nstreams); + std::vector A_values_v(nstreams); + std::vector L_row_map_v(nstreams); + std::vector L_entries_v(nstreams); + std::vector L_values_v(nstreams); + std::vector U_row_map_v(nstreams); + std::vector U_entries_v(nstreams); + std::vector U_values_v(nstreams); - Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); - Kokkos::resize(L_values_v[i], spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); - Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); - } // Done handle creation and spiluk_symbolic on all streams - - // Numeric phase - spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, - A_entries_v, A_values_v, L_row_map_v, L_entries_v, - L_values_v, U_row_map_v, U_entries_v, U_values_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - auto spiluk_handle = kh_v[i].get_spiluk_handle(); - crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], - A_entries_v[i]); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], - L_row_map_v[i], L_entries_v[i]); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values_v[i], - U_row_map_v[i], U_entries_v[i]); + std::vector> Afix = get_9x9_fixture(); - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); + RowMapType row_map; + EntriesType entries; + ValuesType values; - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); + compress_matrix(row_map, entries, values, Afix); - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + const size_type nrows = Afix.size(); + const size_type nnz = values.extent(0); - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + ValuesType_hostmirror hvalues("hvalues", nnz); - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + Kokkos::deep_copy(hrow_map, row_map); + Kokkos::deep_copy(hentries, entries); + Kokkos::deep_copy(hvalues, values); - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + typename KernelHandle::const_nnz_lno_t fill_lev = 2; - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + for (int i = 0; i < nstreams; i++) { + // Allocate A as input + A_row_map_v[i] = RowMapType("A_row_map", nrows + 1); + A_entries_v[i] = EntriesType("A_entries", nnz); + A_values_v[i] = ValuesType("A_values", nnz); - kh_v[i].destroy_spiluk_handle(); + // Copy from host to device + Kokkos::deep_copy(A_row_map_v[i], hrow_map); + Kokkos::deep_copy(A_entries_v[i], hentries); + Kokkos::deep_copy(A_values_v[i], hvalues); + + // Create handle + kh_v[i] = KernelHandle(); + kh_v[i].create_spiluk_handle(test_algo, nrows, 4 * nrows, 4 * nrows); + kh_ptr_v[i] = &kh_v[i]; + + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + + // Allocate L and U as outputs + L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); + L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); + U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); + U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); + + // Symbolic phase + spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], + L_row_map_v[i], L_entries_v[i], U_row_map_v[i], + U_entries_v[i], nstreams); + + Kokkos::fence(); + + Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); + L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); + U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); + } // Done handle creation and spiluk_symbolic on all streams + + // Numeric phase + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, + A_entries_v, A_values_v, L_row_map_v, L_entries_v, + L_values_v, U_row_map_v, U_entries_v, U_values_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i]); + + kh_v[i].destroy_spiluk_handle(); + } } -} + + static void run_test_spiluk_streams_blocks(SPILUKAlgorithm test_algo, + int nstreams) { + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } + } +#endif + if (!run_streams_test) return; + + std::vector weights(nstreams, 1); + std::vector instances = + Kokkos::Experimental::partition_space(execution_space(), weights); + + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector A_row_map_v(nstreams); + std::vector A_entries_v(nstreams); + std::vector A_values_v(nstreams); + std::vector L_row_map_v(nstreams); + std::vector L_entries_v(nstreams); + std::vector L_values_v(nstreams); + std::vector U_row_map_v(nstreams); + std::vector U_entries_v(nstreams); + std::vector U_values_v(nstreams); + + std::vector> Afix = get_9x9_fixture(); + + RowMapType row_map, brow_map; + EntriesType entries, bentries; + ValuesType values, bvalues; + + compress_matrix(row_map, entries, values, Afix); + + const size_type nrows = Afix.size(); + const size_type block_size = nrows % 2 == 0 ? 2 : 3; + const size_type block_items = block_size * block_size; + ASSERT_EQ(nrows % block_size, 0); + + // Convert to BSR + Crs crs("crs for block spiluk test", nrows, nrows, values.extent(0), values, + row_map, entries); + Bsr bsr(crs, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + const size_type bnrows = brow_map.extent(0) - 1; + const size_type bnnz = bentries.extent(0); + + RowMapType_hostmirror hrow_map("hrow_map", bnrows + 1); + EntriesType_hostmirror hentries("hentries", bnnz); + ValuesType_hostmirror hvalues("hvalues", bnnz * block_items); + + Kokkos::deep_copy(hrow_map, brow_map); + Kokkos::deep_copy(hentries, bentries); + Kokkos::deep_copy(hvalues, bvalues); + + typename KernelHandle::const_nnz_lno_t fill_lev = 2; + + for (int i = 0; i < nstreams; i++) { + // Allocate A as input + A_row_map_v[i] = RowMapType("A_row_map", bnrows + 1); + A_entries_v[i] = EntriesType("A_entries", bnnz); + A_values_v[i] = ValuesType("A_values", bnnz * block_items); + + // Copy from host to device + Kokkos::deep_copy(A_row_map_v[i], hrow_map); + Kokkos::deep_copy(A_entries_v[i], hentries); + Kokkos::deep_copy(A_values_v[i], hvalues); + + // Create handle + kh_v[i] = KernelHandle(); + kh_v[i].create_spiluk_handle(test_algo, bnrows, 4 * bnrows, 4 * bnrows, + block_size); + kh_ptr_v[i] = &kh_v[i]; + + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + + // Allocate L and U as outputs + L_row_map_v[i] = RowMapType("L_row_map", bnrows + 1); + L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); + U_row_map_v[i] = RowMapType("U_row_map", bnrows + 1); + U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); + + // Symbolic phase + spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], + L_row_map_v[i], L_entries_v[i], U_row_map_v[i], + U_entries_v[i], nstreams); + + Kokkos::fence(); + + Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); + L_values_v[i] = + ValuesType("L_values", spiluk_handle->get_nnzL() * block_items); + U_values_v[i] = + ValuesType("U_values", spiluk_handle->get_nnzU() * block_items); + } // Done handle creation and spiluk_symbolic on all streams + + // Numeric phase + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, + A_entries_v, A_values_v, L_row_map_v, L_entries_v, + L_values_v, U_row_map_v, U_entries_v, U_values_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + check_result_block(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + block_size); + + kh_v[i].destroy_spiluk_handle(); + } + } + + static void run_test_spiluk_blocks() { + std::vector> A = get_9x9_fixture(); + + RowMapType row_map, brow_map; + EntriesType entries, bentries; + ValuesType values, bvalues; + + compress_matrix(row_map, entries, values, A); + + const size_type nrows = A.size(); + const size_type nnz = values.extent(0); + const lno_t fill_lev = 2; + const size_type block_size = nrows % 2 == 0 ? 2 : 3; + ASSERT_EQ(nrows % block_size, 0); + + KernelHandle kh; + + // Check block_size=1 produces identical result to unblocked + run_and_check_spiluk_block(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev, 1); + run_and_check_spiluk_block(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, 1); + + // Convert to BSR + Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, + entries); + Bsr bsr(crs, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev, + block_size); + run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); + } +}; } // namespace Test template void test_spiluk() { - Test::run_test_spiluk(); + using TestStruct = Test::SpilukTest; + TestStruct::run_test_spiluk(); + TestStruct::run_test_spiluk_blocks(); } template void test_spiluk_streams() { - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; - Test::run_test_spiluk_streams(0, 1); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; - Test::run_test_spiluk_streams(0, 2); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; - Test::run_test_spiluk_streams(0, 3); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; - Test::run_test_spiluk_streams(0, 4); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; - Test::run_test_spiluk_streams(1, 1); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; - Test::run_test_spiluk_streams(1, 2); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; - Test::run_test_spiluk_streams(1, 3); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; - Test::run_test_spiluk_streams(1, 4); + using TestStruct = Test::SpilukTest; + + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 1); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 2); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 3); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 4); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); + + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 1); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 2); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 3); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 4); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 1); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 2); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 3); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 4); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ diff --git a/sparse/unit_test/Test_vector_fixtures.hpp b/sparse/unit_test/Test_vector_fixtures.hpp new file mode 100644 index 0000000000..21b155970d --- /dev/null +++ b/sparse/unit_test/Test_vector_fixtures.hpp @@ -0,0 +1,194 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _TEST_VECTOR_FIXTURES_HPP +#define _TEST_VECTOR_FIXTURES_HPP + +#include + +#include + +/** + * API for working with 2D vectors of small matrices for testing. + */ + +namespace Test { + +template +void compress_matrix( + RowMapT& row_map, EntriesT& entries, ValuesT& values, + const std::vector>& + fixture) { + using size_type = typename RowMapT::non_const_value_type; + using scalar_t = typename ValuesT::non_const_value_type; + + const scalar_t ZERO = scalar_t(0); + + const size_type nrows = fixture.size(); + + // Count fixture nnz's + size_type nnz = 0; + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { + if (fixture[row_idx][col_idx] != ZERO) { + ++nnz; + } + } + } + + // Allocate device CRS views + Kokkos::resize(row_map, nrows + 1); + Kokkos::resize(entries, nnz); + Kokkos::resize(values, nnz); + + // Create host mirror views for CRS + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + // Compress into CRS (host views) + size_type curr_nnz = 0; + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { + if (fixture[row_idx][col_idx] != ZERO) { + hentries(curr_nnz) = col_idx; + hvalues(curr_nnz) = fixture[row_idx][col_idx]; + ++curr_nnz; + } + hrow_map(row_idx + 1) = curr_nnz; + } + } + + // Copy host CRS views to device CRS views + Kokkos::deep_copy(row_map, hrow_map); + Kokkos::deep_copy(entries, hentries); + Kokkos::deep_copy(values, hvalues); +} + +template +std::vector> +decompress_matrix(const RowMapT& row_map, const EntriesT& entries, + const ValuesT& values) { + using size_type = typename RowMapT::non_const_value_type; + using scalar_t = typename ValuesT::non_const_value_type; + + const scalar_t ZERO = scalar_t(0); + + const size_type nrows = row_map.size() - 1; + std::vector> result; + result.resize(nrows); + for (auto& row : result) { + row.resize(nrows, ZERO); + } + + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + Kokkos::deep_copy(hrow_map, row_map); + Kokkos::deep_copy(hentries, entries); + Kokkos::deep_copy(hvalues, values); + + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + const size_type row_nnz_begin = hrow_map(row_idx); + const size_type row_nnz_end = hrow_map(row_idx + 1); + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { + const auto col_idx = hentries(row_nnz); + const scalar_t value = hvalues(row_nnz); + result[row_idx][col_idx] = value; + } + } + + return result; +} + +template +std::vector> +decompress_matrix(const RowMapT& row_map, const EntriesT& entries, + const ValuesT& values, const int block_size) { + using size_type = typename RowMapT::non_const_value_type; + using scalar_t = typename ValuesT::non_const_value_type; + + const scalar_t ZERO = scalar_t(0); + + const size_type nbrows = row_map.extent(0) - 1; + const size_type nrows = nbrows * block_size; + const size_type block_items = block_size * block_size; + std::vector> result; + result.resize(nrows); + for (auto& row : result) { + row.resize(nrows, ZERO); + } + + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + Kokkos::deep_copy(hrow_map, row_map); + Kokkos::deep_copy(hentries, entries); + Kokkos::deep_copy(hvalues, values); + + for (size_type row_idx = 0; row_idx < nbrows; ++row_idx) { + const size_type row_nnz_begin = hrow_map(row_idx); + const size_type row_nnz_end = hrow_map(row_idx + 1); + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { + const auto col_idx = hentries(row_nnz); + for (size_type i = 0; i < block_size; ++i) { + const size_type unc_row_idx = row_idx * block_size + i; + for (size_type j = 0; j < block_size; ++j) { + const size_type unc_col_idx = col_idx * block_size + j; + result[unc_row_idx][unc_col_idx] = + hvalues(row_nnz * block_items + i * block_size + j); + } + } + } + } + + return result; +} + +template +void check_matrix( + const std::string& name, const RowMapT& row_map, const EntriesT& entries, + const ValuesT& values, + const std::vector>& + expected) { + using size_type = typename RowMapT::non_const_value_type; + + const auto decompressed_mtx = decompress_matrix(row_map, entries, values); + + const size_type nrows = row_map.size() - 1; + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { + EXPECT_NEAR(expected[row_idx][col_idx], + decompressed_mtx[row_idx][col_idx], 0.01) + << "Failed check is: " << name << "[" << row_idx << "][" << col_idx + << "]"; + } + } +} + +template +void print_matrix(const std::vector>& matrix) { + for (const auto& row : matrix) { + for (const auto& item : row) { + std::printf("%.2f ", item); + } + std::cout << std::endl; + } +} + +} // namespace Test + +#endif // _TEST_VECTOR_FIXTURES_HPP From c34c6c552bc49a6810858002c9399b582d72475f Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Tue, 16 Jan 2024 14:04:49 -0600 Subject: [PATCH 147/326] Add CUDA/HIP TPL support for KokkosSparse::spadd (#1962) * spadd: change arguments to ctor of SPADDHandle add a default value to input_sorted; add a second argument input_merged to indicate unqiue entries; So that we can easily know whether we can use TPLs on the input matrices * spadd: add cuda/rocm TPL support for spadd_symbolic/numeric --- perf_test/sparse/KokkosSparse_spadd.cpp | 10 +- .../KokkosSparse_par_ilut_numeric_impl.hpp | 22 +- .../impl/KokkosSparse_spadd_numeric_impl.hpp | 31 +- .../impl/KokkosSparse_spadd_numeric_spec.hpp | 62 ++-- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 114 ++++--- .../impl/KokkosSparse_spadd_symbolic_spec.hpp | 52 ++-- sparse/src/KokkosKernels_Handle.hpp | 23 +- sparse/src/KokkosSparse_spadd.hpp | 242 ++++++++++----- sparse/src/KokkosSparse_spadd_handle.hpp | 53 +++- ...kkosSparse_spadd_numeric_tpl_spec_decl.hpp | 282 ++++++++++++++++++ ...kosSparse_spadd_symbolic_tpl_spec_decl.hpp | 238 +++++++++++++++ .../KokkosSparse_spadd_tpl_spec_avail.hpp | 117 +++++++- .../tpls/KokkosSparse_spadd_tpl_spec_decl.hpp | 24 -- sparse/unit_test/Test_Sparse_spadd.hpp | 23 +- 14 files changed, 1029 insertions(+), 264 deletions(-) create mode 100644 sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp create mode 100644 sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp delete mode 100644 sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 3b347eb903..a785ea82f6 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -303,8 +303,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { double numericTime = 0; // Do an untimed warm up symbolic, and preallocate space for C entries/values - spadd_symbolic(&kh, A.graph.row_map, A.graph.entries, B.graph.row_map, - B.graph.entries, row_mapC); + spadd_symbolic(exec_space{}, &kh, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); bool use_kk = !params.use_cusparse && !params.use_mkl; @@ -366,7 +366,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { for (int sumRep = 0; sumRep < params.repeat; sumRep++) { timer.reset(); if (use_kk) { - spadd_symbolic(&kh, A.graph.row_map, A.graph.entries, B.graph.row_map, + spadd_symbolic(exec_space{}, &kh, A.numRows(), A.numCols(), + A.graph.row_map, A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); c_nnz = addHandle->get_c_nnz(); } else if (params.use_cusparse) { @@ -434,7 +435,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { } #endif } else { - spadd_numeric(&kh, A.graph.row_map, A.graph.entries, A.values, + spadd_numeric(exec_space{}, &kh, A.numRows(), A.numCols(), + A.graph.row_map, A.graph.entries, A.values, 1.0, // A, alpha B.graph.row_map, B.graph.entries, B.values, 1.0, // B, beta diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 9375039747..6bdf0eb577 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -699,18 +699,24 @@ struct IlutWrap { multiply_matrices(kh, ih, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, LU_row_map, LU_entries, LU_values); - auto addHandle = kh.get_spadd_handle(); - KokkosSparse::Experimental::spadd_symbolic( - &kh, A_row_map, A_entries, LU_row_map, LU_entries, R_row_map); + auto addHandle = kh.get_spadd_handle(); + typename KHandle::const_nnz_lno_t m = A_row_map.extent(0) - 1, + n = m; // square matrix + // TODO: let compute_residual_norm also take an execution space argument and + // use that for exec! + typename KHandle::HandleExecSpace exec{}; + KokkosSparse::Experimental::spadd_symbolic(exec, &kh, m, n, A_row_map, + A_entries, LU_row_map, + LU_entries, R_row_map); const size_type r_nnz = addHandle->get_c_nnz(); - Kokkos::resize(R_entries, r_nnz); - Kokkos::resize(R_values, r_nnz); + Kokkos::resize(exec, R_entries, r_nnz); + Kokkos::resize(exec, R_values, r_nnz); KokkosSparse::Experimental::spadd_numeric( - &kh, A_row_map, A_entries, A_values, 1., LU_row_map, LU_entries, - LU_values, -1., R_row_map, R_entries, R_values); - + exec, &kh, m, n, A_row_map, A_entries, A_values, 1., LU_row_map, + LU_entries, LU_values, -1., R_row_map, R_entries, R_values); + // TODO: how to make this policy use exec? auto policy = ih.get_default_team_policy(); Kokkos::parallel_reduce( diff --git a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp index 8e70cd3c3b..fa356dc963 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -174,24 +174,23 @@ struct UnsortedNumericSumFunctor { std::is_same::type, \ typename std::remove_const::type>::value -template +template < + typename execution_space, typename KernelHandle, typename alno_row_view_t, + typename alno_nnz_view_t, typename ascalar_t, typename ascalar_nnz_view_t, + typename blno_row_view_t, typename blno_nnz_view_t, typename bscalar_t, + typename bscalar_nnz_view_t, typename clno_row_view_t, + typename clno_nnz_view_t, typename cscalar_nnz_view_t> void spadd_numeric_impl( - KernelHandle* kernel_handle, const alno_row_view_t a_rowmap, - const alno_nnz_view_t a_entries, const ascalar_nnz_view_t a_values, - const ascalar_t alpha, const blno_row_view_t b_rowmap, - const blno_nnz_view_t b_entries, const bscalar_nnz_view_t b_values, - const bscalar_t beta, const clno_row_view_t c_rowmap, - clno_nnz_view_t c_entries, cscalar_nnz_view_t c_values) { + const execution_space& exec, KernelHandle* kernel_handle, + const alno_row_view_t a_rowmap, const alno_nnz_view_t a_entries, + const ascalar_nnz_view_t a_values, const ascalar_t alpha, + const blno_row_view_t b_rowmap, const blno_nnz_view_t b_entries, + const bscalar_nnz_view_t b_values, const bscalar_t beta, + const clno_row_view_t c_rowmap, clno_nnz_view_t c_entries, + cscalar_nnz_view_t c_values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; // Check that A/B/C data types match KernelHandle types, and that C data types // are nonconst (doesn't matter if A/B types are const) static_assert(SAME_TYPE(ascalar_t, scalar_type), @@ -252,7 +251,7 @@ void spadd_numeric_impl( sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, c_entries, a_values, b_values, c_values, alpha, beta); Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", - range_type(0, nrows), sortedNumeric); + range_type(exec, 0, nrows), sortedNumeric); } else { // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C // entries and values @@ -265,7 +264,7 @@ void spadd_numeric_impl( c_entries, a_values, b_values, c_values, alpha, beta, addHandle->get_a_pos(), addHandle->get_b_pos()); Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", - range_type(0, nrows), unsortedNumeric); + range_type(exec, 0, nrows), unsortedNumeric); } addHandle->set_call_numeric(); } diff --git a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index e81649f552..18731348de 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -28,10 +28,10 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spadd_numeric_eti_spec_avail { enum : bool { value = false }; }; @@ -44,6 +44,7 @@ struct spadd_numeric_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spadd_numeric_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -87,20 +88,22 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) -template ::value, + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + a_scalar_view_t, b_size_view_t, b_lno_view_t, b_scalar_view_t, + c_size_view_t, c_lno_view_t, c_scalar_view_t>::value, bool eti_spec_avail = spadd_numeric_eti_spec_avail< - KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, - b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, - c_lno_view_t, c_scalar_view_t>::value> + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + a_scalar_view_t, b_size_view_t, b_lno_view_t, b_scalar_view_t, + c_size_view_t, c_lno_view_t, c_scalar_view_t>::value> struct SPADD_NUMERIC { - static void spadd_numeric(KernelHandle *handle, + static void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, typename a_scalar_view_t::const_value_type alpha, a_size_view_t row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, @@ -112,15 +115,17 @@ struct SPADD_NUMERIC { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct SPADD_NUMERIC { - static void spadd_numeric(KernelHandle *handle, +template +struct SPADD_NUMERIC< + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, + b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, c_lno_view_t, + c_scalar_view_t, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> { + static void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t /* m */, + typename KernelHandle::const_nnz_lno_t /* n */, typename a_scalar_view_t::const_value_type alpha, a_size_view_t row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, @@ -128,8 +133,9 @@ struct SPADD_NUMERIC, \ @@ -178,6 +185,7 @@ struct SPADD_NUMERIC, \ @@ -210,6 +218,6 @@ struct SPADD_NUMERIC >, \ false, true>; -#include +#include #endif diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 15132f9da3..80506e3056 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -371,50 +371,48 @@ struct MergeEntriesFunctor { }; // Run SortedCountEntries: non-GPU, always uses the RangePolicy version. -template +template void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = + const execution_space& exec, const alno_row_view_t_& a_rowmap, + const alno_nnz_view_t_& a_entries, const blno_row_view_t_& b_rowmap, + const blno_nnz_view_t_& b_entries, const clno_row_view_t_& c_rowmap, + typename std::enable_if< + !KokkosKernels::Impl::kk_is_gpu_exec_space()>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using range_type = Kokkos::RangePolicy; - auto nrows = c_rowmap.extent(0) - 1; + using range_type = Kokkos::RangePolicy; + auto nrows = c_rowmap.extent(0) - 1; SortedCountEntriesRange countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); Kokkos::parallel_for( "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - range_type(0, nrows), countEntries); + range_type(exec, 0, nrows), countEntries); } // Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending // on average nz per row (a runtime decision) -template +template void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = + const execution_space& exec, const alno_row_view_t_& a_rowmap, + const alno_nnz_view_t_& a_entries, const blno_row_view_t_& b_rowmap, + const blno_nnz_view_t_& b_entries, const clno_row_view_t_& c_rowmap, + typename std::enable_if< + KokkosKernels::Impl::kk_is_gpu_exec_space()>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using RangePol = Kokkos::RangePolicy; - using TeamPol = Kokkos::TeamPolicy; - auto nrows = c_rowmap.extent(0) - 1; + using RangePol = Kokkos::RangePolicy; + using TeamPol = Kokkos::TeamPolicy; + auto nrows = c_rowmap.extent(0) - 1; size_type c_est_nnz = 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; if (c_est_nnz <= 512) { @@ -435,14 +433,14 @@ void runSortedCountEntries( countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); countEntries.sharedPerThread = pot_est_nnz; // compute largest possible team size - TeamPol testPolicy(1, 1, vector_length); + TeamPol testPolicy(exec, 1, 1, vector_length); testPolicy.set_scratch_size( 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); int team_size = testPolicy.team_size_recommended(countEntries, Kokkos::ParallelForTag()); // construct real policy int league_size = (nrows + team_size - 1) / team_size; - TeamPol policy(league_size, team_size, vector_length); + TeamPol policy(exec, league_size, team_size, vector_length); policy.set_scratch_size( 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); countEntries.totalShared = @@ -457,24 +455,23 @@ void runSortedCountEntries( countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); Kokkos::parallel_for( "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - RangePol(0, nrows), countEntries); + RangePol(exec, 0, nrows), countEntries); } } // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. -template +template void spadd_symbolic_impl( - KernelHandle* handle, const alno_row_view_t_ a_rowmap, - const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, - const blno_nnz_view_t_ b_entries, + const execution_space& exec, KernelHandle* handle, + const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, + const blno_row_view_t_ b_rowmap, const blno_nnz_view_t_ b_entries, clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't // need to be initialized) { - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::SPADDHandleType::nnz_lno_view_t ordinal_view_t; @@ -520,17 +517,18 @@ void spadd_symbolic_impl( ordinal_type nrows = a_rowmap.extent(0) - 1; typedef Kokkos::RangePolicy range_type; if (addHandle->is_input_sorted()) { - runSortedCountEntries( - a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + runSortedCountEntries(exec, a_rowmap, a_entries, b_rowmap, + b_entries, c_rowmap); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); + exec, nrows + 1, c_rowmap); } else { // note: scoping individual parts of the process to free views sooner, // minimizing peak memory usage run the unsorted c_rowmap upper bound // functor (just adds together A and B entry counts row by row) offset_view_t c_rowmap_upperbound( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "C row counts upper bound"), nrows + 1); size_type c_nnz_upperbound = 0; @@ -540,17 +538,17 @@ void spadd_symbolic_impl( countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound); Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", - range_type(0, nrows), countEntries); + range_type(exec, 0, nrows), countEntries); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap_upperbound); - Kokkos::deep_copy(c_nnz_upperbound, + exec, nrows + 1, c_rowmap_upperbound); + Kokkos::deep_copy(exec, c_nnz_upperbound, Kokkos::subview(c_rowmap_upperbound, nrows)); } ordinal_view_t c_entries_uncompressed( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "C entries uncompressed"), c_nnz_upperbound); - ordinal_view_t ab_perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, + ordinal_view_t ab_perm(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "A and B permuted entry indices"), c_nnz_upperbound); // compute the unmerged sum @@ -561,17 +559,17 @@ void spadd_symbolic_impl( c_rowmap_upperbound, c_entries_uncompressed, ab_perm); Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", - range_type(0, nrows), unmergedSum); + range_type(exec, 0, nrows), unmergedSum); // sort the unmerged sum KokkosSparse::sort_crs_matrix( - c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - ordinal_view_t a_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), - a_entries.extent(0)); - ordinal_view_t b_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"), - b_entries.extent(0)); + exec, c_rowmap_upperbound, c_entries_uncompressed, ab_perm); + ordinal_view_t a_pos(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "A entry positions"), + a_entries.extent(0)); + ordinal_view_t b_pos(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "B entry positions"), + b_entries.extent(0)); // merge the entries and compute Apos/Bpos, as well as Crowcounts { MergeEntriesFunctor( - nrows + 1, c_rowmap); + exec, nrows + 1, c_rowmap); } addHandle->set_a_b_pos(a_pos, b_pos); } // provide the number of NNZ in C to user through handle size_type cmax; - Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows)); + Kokkos::deep_copy(exec, cmax, Kokkos::subview(c_rowmap, nrows)); addHandle->set_c_nnz(cmax); addHandle->set_call_symbolic(); addHandle->set_call_numeric(false); diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index aaab68568a..bdc4ed04bd 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -28,8 +28,9 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spadd_symbolic_eti_spec_avail { enum : bool { value = false }; }; @@ -42,6 +43,7 @@ struct spadd_symbolic_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spadd_symbolic_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -73,31 +75,39 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) -template ::value, + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + b_size_view_t, b_lno_view_t, c_size_view_t>::value, bool eti_spec_avail = spadd_symbolic_eti_spec_avail< - KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t, - b_lno_view_t, c_size_view_t>::value> + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + b_size_view_t, b_lno_view_t, c_size_view_t>::value> struct SPADD_SYMBOLIC { - static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, - a_lno_view_t entriesA, b_size_view_t row_mapB, - b_lno_view_t entriesB, c_size_view_t row_mapC); + static void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + a_size_view_t row_mapA, a_lno_view_t entriesA, + b_size_view_t row_mapB, b_lno_view_t entriesB, + c_size_view_t row_mapC); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct SPADD_SYMBOLIC +struct SPADD_SYMBOLIC { - static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, - a_lno_view_t entriesA, b_size_view_t row_mapB, - b_lno_view_t entriesB, c_size_view_t row_mapC) { - spadd_symbolic_impl(handle, row_mapA, entriesA, row_mapB, entriesB, + static void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t /* m */, + typename KernelHandle::const_nnz_lno_t /* n */, + a_size_view_t row_mapA, a_lno_view_t entriesA, + b_size_view_t row_mapB, b_lno_view_t entriesB, + c_size_view_t row_mapC) { + spadd_symbolic_impl(exec, handle, row_mapA, entriesA, row_mapB, entriesB, row_mapC); } }; @@ -111,6 +121,7 @@ struct SPADD_SYMBOLIC, \ @@ -135,6 +146,7 @@ struct SPADD_SYMBOLIC, \ @@ -155,6 +167,6 @@ struct SPADD_SYMBOLIC >, \ false, true>; -#include +#include #endif diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 6a5b5b6f20..680045823e 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -605,18 +605,18 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: - * + * * KokkosSpace::GS_DEFAULT PointGaussSeidel * KokkosSpace::GS_PERMUTED ?? * KokkosSpace::GS_TEAM ?? * KokkosSpace::GS_CLUSTER ?? * KokkosSpace::GS_TWOSTAGE ?? * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT ?? * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring @@ -649,9 +649,9 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param gs_algorithm Specifies which algorithm to use: - * + * * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. * KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. @@ -660,7 +660,7 @@ class KokkosKernelsHandle { * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT Depends on execution space: * COLORING_SERIAL on Kokkos::Serial; * COLORING_EB on GPUs; @@ -744,16 +744,16 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gs handle object - * + * * @param clusterAlgo Specifies which clustering algorithm to use: - * + * * KokkosSparse::CLUSTER_DEFAULT ?? * KokkosSparse::CLUSTER_MIS2 ?? * KokkosSparse::CLUSTER_BALLOON ?? * KokkosSparse::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT ?? * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring @@ -821,10 +821,11 @@ class KokkosKernelsHandle { // ---------------------------------------- // SPADDHandleType *get_spadd_handle() { return this->spaddHandle; } - void create_spadd_handle(bool input_sorted) { + void create_spadd_handle(bool input_sorted = false, + bool input_merged = false) { this->destroy_spadd_handle(); this->is_owner_of_the_spadd_handle = true; - this->spaddHandle = new SPADDHandleType(input_sorted); + this->spaddHandle = new SPADDHandleType(input_sorted, input_merged); } void destroy_spadd_handle() { if (is_owner_of_the_spadd_handle && this->spaddHandle != NULL) { diff --git a/sparse/src/KokkosSparse_spadd.hpp b/sparse/src/KokkosSparse_spadd.hpp index 74efed66bc..4151ea6783 100644 --- a/sparse/src/KokkosSparse_spadd.hpp +++ b/sparse/src/KokkosSparse_spadd.hpp @@ -19,25 +19,27 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_helpers.hpp" -#include "KokkosSparse_spadd_symbolic_spec.hpp" +#include "KokkosBlas1_scal.hpp" #include "KokkosSparse_spadd_numeric_spec.hpp" +#include "KokkosSparse_spadd_symbolic_spec.hpp" namespace KokkosSparse { namespace Experimental { // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. -template void spadd_symbolic( - KernelHandle* handle, const alno_row_view_t_ a_rowmap, + const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, // same type as column indices + typename KernelHandle::const_nnz_lno_t n, const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, const blno_nnz_view_t_ b_entries, clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't // need to be initialized) { - typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; @@ -51,49 +53,69 @@ void spadd_symbolic( ConstKernelHandle; ConstKernelHandle tmp_handle(*handle); - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_rowmap; - KokkosSparse::Impl::SPADD_SYMBOLIC:: - spadd_symbolic(&tmp_handle, - Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), - Internal_a_entries(a_entries.data(), a_entries.extent(0)), - Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), - Internal_b_entries(b_entries.data(), b_entries.extent(0)), - Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + + auto addHandle = handle->get_spadd_handle(); + bool useFallback = !addHandle->is_input_strict_crs(); + if (useFallback) { + KokkosSparse::Impl::SPADD_SYMBOLIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_b_rowmap, Internal_b_entries, Internal_c_rowmap, false>:: + spadd_symbolic( + exec, &tmp_handle, m, n, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + } else { + KokkosSparse::Impl::SPADD_SYMBOLIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_b_rowmap, Internal_b_entries, Internal_c_rowmap>:: + spadd_symbolic( + exec, &tmp_handle, m, n, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + } } -template -void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, +void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, const ascalar_nnz_view_t_ a_values, const ascalar_t_ alpha, const blno_row_view_t_ b_rowmap, @@ -101,7 +123,6 @@ void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, const bscalar_nnz_view_t_ b_values, const bscalar_t_ beta, const clno_row_view_t_ c_rowmap, clno_nnz_view_t_ c_entries, cscalar_nnz_view_t_ c_values) { - typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; @@ -113,116 +134,177 @@ void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> ConstKernelHandle; - ConstKernelHandle tmp_handle(*handle); + ConstKernelHandle tmp_handle(*handle); // handle->exec_space is also copied - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_values; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_values; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_values; - KokkosSparse::Impl::SPADD_NUMERIC:: - spadd_numeric(&tmp_handle, alpha, - Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), - Internal_a_entries(a_entries.data(), a_entries.extent(0)), - Internal_a_values(a_values.data(), a_values.extent(0)), - beta, - Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), - Internal_b_entries(b_entries.data(), b_entries.extent(0)), - Internal_b_values(b_values.data(), b_values.extent(0)), - Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), - Internal_c_entries(c_entries.data(), c_entries.extent(0)), - Internal_c_values(c_values.data(), c_values.extent(0))); + + auto addHandle = handle->get_spadd_handle(); + bool useFallback = !addHandle->is_input_strict_crs(); + if (useFallback) { + KokkosSparse::Impl::SPADD_NUMERIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_a_values, Internal_b_rowmap, Internal_b_entries, + Internal_b_values, Internal_c_rowmap, Internal_c_entries, + Internal_c_values, false>:: + spadd_numeric(exec, &tmp_handle, m, n, alpha, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); + } else { + KokkosSparse::Impl::SPADD_NUMERIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_a_values, Internal_b_rowmap, Internal_b_entries, + Internal_b_values, Internal_c_rowmap, Internal_c_entries, + Internal_c_values>:: + spadd_numeric(exec, &tmp_handle, m, n, alpha, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); + } } } // namespace Experimental // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. -template -void spadd_symbolic(KernelHandle* handle, const AMatrix& A, const BMatrix& B, - CMatrix& C) { +template +void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + const AMatrix &A, const BMatrix &B, CMatrix &C) { using row_map_type = typename CMatrix::row_map_type::non_const_type; using entries_type = typename CMatrix::index_type::non_const_type; using values_type = typename CMatrix::values_type::non_const_type; + auto addHandle = handle->get_spadd_handle(); + // Create the row_map of C, no need to initialize it row_map_type row_mapC( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "row map"), + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "row map"), A.numRows() + 1); - KokkosSparse::Experimental::spadd_symbolic(handle, A.graph.row_map, - A.graph.entries, B.graph.row_map, - B.graph.entries, row_mapC); + + // Shortcuts for special cases as they cause errors in some TPL + // implementations (e.g., cusparse and hipsparse) + if (!A.nnz()) { + Kokkos::deep_copy(exec, row_mapC, B.graph.row_map); + addHandle->set_c_nnz(B.graph.entries.extent(0)); + } else if (!B.nnz()) { + Kokkos::deep_copy(exec, row_mapC, A.graph.row_map); + addHandle->set_c_nnz(A.graph.entries.extent(0)); + } else { + KokkosSparse::Experimental::spadd_symbolic( + exec, handle, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); + } // Now create and allocate the entries and values // views so we can build a graph and then matrix C // and subsequently construct C. - auto addHandle = handle->get_spadd_handle(); entries_type entriesC( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"), + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "entries"), addHandle->get_c_nnz()); // Finally since we already have the number of nnz handy // we can go ahead and allocate C's values and set them. - values_type valuesC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), - addHandle->get_c_nnz()); + values_type valuesC( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "values"), + addHandle->get_c_nnz()); C = CMatrix("matrix", A.numRows(), A.numCols(), addHandle->get_c_nnz(), valuesC, row_mapC, entriesC); } -// Symbolic: count entries in each row in C to produce rowmap +// Numeric: fill the column indices and values // kernel handle has information about whether it is sorted add or not. +template +void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + const AScalar alpha, const AMatrix &A, const BScalar beta, + const BMatrix &B, CMatrix &C) { + if (!A.nnz()) { + Kokkos::deep_copy(exec, C.graph.entries, B.graph.entries); + KokkosBlas::scal(exec, C.values, beta, B.values); + } else if (!B.nnz()) { + Kokkos::deep_copy(exec, C.graph.entries, A.graph.entries); + KokkosBlas::scal(exec, C.values, alpha, A.values); + } else { + KokkosSparse::Experimental::spadd_numeric( + exec, handle, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, A.values, alpha, B.graph.row_map, B.graph.entries, + B.values, beta, C.graph.row_map, C.graph.entries, C.values); + } +} + +// One without an explicit execution space argument +template +void spadd_symbolic(KernelHandle *handle, const AMatrix &A, const BMatrix &B, + CMatrix &C) { + spadd_symbolic(typename AMatrix::execution_space{}, handle, A, B, C); +} + template -void spadd_numeric(KernelHandle* handle, const AScalar alpha, const AMatrix& A, - const BScalar beta, const BMatrix& B, CMatrix& C) { - KokkosSparse::Experimental::spadd_numeric( - handle, A.graph.row_map, A.graph.entries, A.values, alpha, - B.graph.row_map, B.graph.entries, B.values, beta, C.graph.row_map, - C.graph.entries, C.values); +void spadd_numeric(KernelHandle *handle, const AScalar alpha, const AMatrix &A, + const BScalar beta, const BMatrix &B, CMatrix &C) { + spadd_numeric(typename AMatrix::execution_space{}, handle, alpha, A, beta, B, + C); } } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_spadd_handle.hpp b/sparse/src/KokkosSparse_spadd_handle.hpp index 2902550d6a..760f912c6d 100644 --- a/sparse/src/KokkosSparse_spadd_handle.hpp +++ b/sparse/src/KokkosSparse_spadd_handle.hpp @@ -32,8 +32,46 @@ class SPADDHandle { typedef typename lno_row_view_t_::non_const_value_type size_type; typedef ExecutionSpace execution_space; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + struct SpaddCusparseData { + size_t nbytes; + void* workspace; + cusparseMatDescr_t descrA, descrB, descrC; + + SpaddCusparseData() + : nbytes(0), + workspace(nullptr), + descrA(nullptr), + descrB(nullptr), + descrC(nullptr) {} + + ~SpaddCusparseData() { + Kokkos::kokkos_free(workspace); + cusparseDestroyMatDescr(descrA); + cusparseDestroyMatDescr(descrB); + cusparseDestroyMatDescr(descrC); + } + }; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + struct SpaddRocsparseData { + rocsparse_mat_descr descrA, descrB, descrC; + + SpaddRocsparseData() : descrA(nullptr), descrB(nullptr), descrC(nullptr) {} + + ~SpaddRocsparseData() { + rocsparse_destroy_mat_descr(descrA); + rocsparse_destroy_mat_descr(descrB); + rocsparse_destroy_mat_descr(descrC); + } + }; +#endif + private: - bool input_sorted; + // if both are true, the input matrices are strict CRS + bool input_sorted; // column indices in a row are sorted + bool input_merged; // column indices in a row are unique (i.e., merged) size_type result_nnz_size; @@ -76,11 +114,20 @@ class SPADDHandle { int get_sort_option() { return this->sort_option; } +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + SpaddCusparseData cusparseData; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + SpaddRocsparseData rocsparseData; +#endif + /** * \brief Default constructor. */ - SPADDHandle(bool input_is_sorted) + SPADDHandle(bool input_is_sorted, bool input_is_merged = false) : input_sorted(input_is_sorted), + input_merged(input_is_merged), result_nnz_size(0), called_symbolic(false), called_numeric(false) {} @@ -95,6 +142,8 @@ class SPADDHandle { void set_call_numeric(bool call = true) { this->called_numeric = call; } bool is_input_sorted() { return input_sorted; } + bool is_input_merged() { return input_merged; } + bool is_input_strict_crs() { return input_sorted && input_merged; } }; } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp new file mode 100644 index 0000000000..0952654bdf --- /dev/null +++ b/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp @@ -0,0 +1,282 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_NUMERIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spadd_numeric( \ + const EXEC_SPACE_TYPE &exec, kernelhandle_t *handle, ORDINAL_TYPE m, \ + ORDINAL_TYPE n, const KOKKOS_SCALAR_TYPE alpha, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, scalar_view_t valuesA, \ + const KOKKOS_SCALAR_TYPE beta, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, scalar_view_t valuesB, rowmap_view_t rowmapC, \ + non_const_colidx_view_t colidxC, non_const_scalar_view_t valuesC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_numeric[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto &cuspData = addHandle->cusparseData; \ + auto &cuspHandle = \ + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; \ + cusparsePointerMode_t oldPtrMode; \ + \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetStream(cuspHandle, exec.cuda_stream())); \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseGetPointerMode(cuspHandle, &oldPtrMode)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetPointerMode( \ + cuspHandle, CUSPARSE_POINTER_MODE_HOST)); /* alpha, beta on host*/ \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparse##TOKEN##csrgeam2( \ + cuspHandle, m, n, reinterpret_cast(&alpha), \ + cuspData.descrA, nnzA, \ + reinterpret_cast(valuesA.data()), \ + rowmapA.data(), colidxA.data(), \ + reinterpret_cast(&beta), cuspData.descrB, \ + nnzB, reinterpret_cast(valuesB.data()), \ + rowmapB.data(), colidxB.data(), cuspData.descrC, \ + reinterpret_cast(valuesC.data()), \ + const_cast(rowmapC.data()), colidxC.data(), \ + cuspData.workspace)); \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetPointerMode(cuspHandle, oldPtrMode)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(cuspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + S, float, float, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + D, double, double, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + C, Kokkos::complex, cuComplex, int, int, Kokkos::LayoutLeft, \ + Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + Z, Kokkos::complex, cuDoubleComplex, int, int, \ + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_NUMERIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spadd_numeric( \ + const EXEC_SPACE_TYPE &exec, kernelhandle_t *handle, ORDINAL_TYPE m, \ + ORDINAL_TYPE n, const KOKKOS_SCALAR_TYPE alpha, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, scalar_view_t valuesA, \ + const KOKKOS_SCALAR_TYPE beta, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, scalar_view_t valuesB, rowmap_view_t rowmapC, \ + non_const_colidx_view_t colidxC, non_const_scalar_view_t valuesC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_numeric[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto &rocData = addHandle->rocsparseData; \ + auto &rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton() \ + .rocsparseHandle; \ + rocsparse_pointer_mode oldPtrMode; \ + \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_get_pointer_mode(rocspHandle, &oldPtrMode)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode( \ + rocspHandle, rocsparse_pointer_mode_host)); /* alpha, beta on host*/ \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_##TOKEN##csrgeam( \ + rocspHandle, m, n, \ + reinterpret_cast(&alpha), rocData.descrA, \ + nnzA, reinterpret_cast(valuesA.data()), \ + rowmapA.data(), colidxA.data(), \ + reinterpret_cast(&beta), rocData.descrB, \ + nnzB, reinterpret_cast(valuesB.data()), \ + rowmapB.data(), colidxB.data(), rocData.descrC, \ + reinterpret_cast(valuesC.data()), \ + const_cast(rowmapC.data()), colidxC.data())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_pointer_mode(rocspHandle, oldPtrMode)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + s, float, float, int, int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + d, double, double, int, int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + c, Kokkos::complex, rocsparse_float_complex, int, int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + z, Kokkos::complex, rocsparse_double_complex, int, int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(false) +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp new file mode 100644 index 0000000000..fe6b51207f --- /dev/null +++ b/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp @@ -0,0 +1,238 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_SYMBOLIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + static void spadd_symbolic(const EXEC_SPACE_TYPE& exec, \ + kernelhandle_t* handle, const ORDINAL_TYPE m, \ + const ORDINAL_TYPE n, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, \ + non_const_rowmap_view_t rowmapC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_symbolic[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto& cuspData = addHandle->cusparseData; \ + auto& cuspHandle = \ + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; \ + \ + /* Not easy to init 'one' for cuda complex, so we don't init it. Anyway, \ + * the uninit'ed var won't affect C's pattern. \ + */ \ + TPL_SCALAR_TYPE one; \ + size_t nbytes; \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + OFFSET_TYPE nnzC = 0; \ + \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetStream(cuspHandle, exec.cuda_stream())); \ + \ + /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparsecreatematdescr \ + It sets the fields MatrixType and IndexBase to the default values \ + CUSPARSE_MATRIX_TYPE_GENERAL and CUSPARSE_INDEX_BASE_ZERO, \ + respectively, while leaving other fields uninitialized. */ \ + \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrA)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrB)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrC)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparse##TOKEN##csrgeam2_bufferSizeExt( \ + cuspHandle, m, n, &one, cuspData.descrA, nnzA, NULL, rowmapA.data(), \ + colidxA.data(), &one, cuspData.descrB, nnzB, NULL, rowmapB.data(), \ + colidxB.data(), cuspData.descrC, NULL, rowmapC.data(), NULL, \ + &nbytes)); \ + cuspData.nbytes = nbytes; \ + cuspData.workspace = Kokkos::kokkos_malloc(nbytes); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz( \ + cuspHandle, m, n, cuspData.descrA, nnzA, rowmapA.data(), \ + colidxA.data(), cuspData.descrB, nnzB, rowmapB.data(), \ + colidxB.data(), cuspData.descrC, rowmapC.data(), &nnzC, \ + cuspData.workspace)); \ + addHandle->set_c_nnz(nnzC); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(cuspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + S, float, float, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + D, double, double, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + C, Kokkos::complex, cuComplex, int, int, Kokkos::LayoutLeft, \ + Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + Z, Kokkos::complex, cuDoubleComplex, int, int, \ + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + KOKKOS_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_SYMBOLIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + static void spadd_symbolic(const EXEC_SPACE_TYPE& exec, \ + kernelhandle_t* handle, const ORDINAL_TYPE m, \ + const ORDINAL_TYPE n, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, \ + non_const_rowmap_view_t rowmapC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_symbolic[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto& rocData = addHandle->rocsparseData; \ + auto& rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton() \ + .rocsparseHandle; \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + OFFSET_TYPE nnzC = 0; \ + \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrA)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrB)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrC)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgeam_nnz( \ + rocspHandle, m, n, rocData.descrA, nnzA, rowmapA.data(), \ + colidxA.data(), rocData.descrB, nnzB, rowmapB.data(), \ + colidxB.data(), rocData.descrC, rowmapC.data(), &nnzC)); \ + addHandle->set_c_nnz(nnzC); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT( \ + ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + float, rocsparse_int, rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + double, rocsparse_int, rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + Kokkos::complex, rocsparse_int, rocsparse_int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + Kokkos::complex, rocsparse_int, rocsparse_int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT(false) +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp index b654c4331c..6d4db8731f 100644 --- a/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp @@ -21,20 +21,125 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists // -template +template struct spadd_symbolic_tpl_spec_avail { enum : bool { value = false }; }; -template +template struct spadd_numeric_tpl_spec_avail { enum : bool { value = false }; }; +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_symbolic_tpl_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_numeric_tpl_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL( \ + ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL(float, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL(double, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL(float, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL(double, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL(int, int, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL(rocsparse_int, rocsparse_int, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +#endif + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp deleted file mode 100644 index 8f5ad83ed7..0000000000 --- a/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ -#define KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ - -namespace KokkosSparse { -namespace Impl {} -} // namespace KokkosSparse - -#endif diff --git a/sparse/unit_test/Test_Sparse_spadd.hpp b/sparse/unit_test/Test_Sparse_spadd.hpp index 05ff97bb3a..3156801dbd 100644 --- a/sparse/unit_test/Test_Sparse_spadd.hpp +++ b/sparse/unit_test/Test_Sparse_spadd.hpp @@ -32,7 +32,11 @@ typedef Kokkos::complex kokkos_complex_double; typedef Kokkos::complex kokkos_complex_float; -// Create a random square matrix for testing mat-mat addition kernels +// Create a random nrows by ncols matrix for testing mat-mat addition kernels. +// minNNZ, maxNNZ: min and max number of nonzeros in any row. +// maxNNZ > ncols will result in duplicated entries in a row, otherwise entries +// in a row are unique. +// sortRows: whether to sort columns in a row template crsMat_t randomMatrix(ordinal_type nrows, ordinal_type ncols, ordinal_type minNNZ, ordinal_type maxNNZ, bool sortRows) { @@ -117,7 +121,9 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, srand((numRows << 1) ^ numCols); KernelHandle handle; - handle.create_spadd_handle(sortRows); + // If maxNNZ <= numCols, the generated A, B have unique column indices in each + // row + handle.create_spadd_handle(sortRows, static_cast(maxNNZ) <= numCols); crsMat_t A = randomMatrix(numRows, numCols, minNNZ, maxNNZ, sortRows); crsMat_t B = @@ -129,9 +135,10 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, // initialized Kokkos::deep_copy(c_row_map, (size_type)5); auto addHandle = handle.get_spadd_handle(); - KokkosSparse::Experimental::spadd_symbolic(&handle, A.graph.row_map, - A.graph.entries, B.graph.row_map, - B.graph.entries, c_row_map); + typename Device::execution_space exec{}; + KokkosSparse::Experimental::spadd_symbolic( + exec, &handle, numRows, numCols, A.graph.row_map, A.graph.entries, + B.graph.row_map, B.graph.entries, c_row_map); size_type c_nnz = addHandle->get_c_nnz(); // Fill values, entries with incorrect incorret values_type c_values( @@ -140,9 +147,9 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, entries_type c_entries("C entries", c_nnz); Kokkos::deep_copy(c_entries, (lno_t)5); KokkosSparse::Experimental::spadd_numeric( - &handle, A.graph.row_map, A.graph.entries, A.values, KAT::one(), - B.graph.row_map, B.graph.entries, B.values, KAT::one(), c_row_map, - c_entries, c_values); + exec, &handle, numRows, numCols, A.graph.row_map, A.graph.entries, + A.values, KAT::one(), B.graph.row_map, B.graph.entries, B.values, + KAT::one(), c_row_map, c_entries, c_values); // done with handle // create C using CRS arrays crsMat_t C("C", numRows, numCols, c_nnz, c_values, c_row_map, c_entries); From aa1259797cb5a7d5ae7e4ae3563a39bf7ae3364d Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 17 Jan 2024 07:56:36 -0700 Subject: [PATCH 148/326] Make spiluk_handle::reset backwards compatible (#2087) * Make spiluk_handle::reset backwards compatible By making block_size default to -1, which means don't change block size. * Switch default val for block_size for reset_handle * formatting * Fix comment --- sparse/src/KokkosSparse_spiluk_handle.hpp | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/sparse/src/KokkosSparse_spiluk_handle.hpp b/sparse/src/KokkosSparse_spiluk_handle.hpp index a7298a7773..2b37d08f6e 100644 --- a/sparse/src/KokkosSparse_spiluk_handle.hpp +++ b/sparse/src/KokkosSparse_spiluk_handle.hpp @@ -131,22 +131,28 @@ class SPILUKHandle { team_size(-1), vector_size(-1) {} - void reset_handle(const size_type nrows_, const size_type nnzL_, - const size_type nnzU_, const size_type block_size_) { + void reset_handle( + const size_type nrows_, const size_type nnzL_, const size_type nnzU_, + const size_type block_size_ = Kokkos::ArithTraits::max()) { set_nrows(nrows_); set_num_levels(0); set_nnzL(nnzL_); set_nnzU(nnzU_); - set_block_size(block_size_); + // user likely does not want to reset block size to 0, so set default + // to size_type::max + if (block_size_ != Kokkos::ArithTraits::max()) { + set_block_size(block_size_); + } set_level_maxrows(0); set_level_maxrowsperchunk(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - hlevel_ptr = nnz_lno_view_host_t("hlevel_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_host_t(), - level_nrowsperchunk = nnz_lno_view_host_t(), reset_symbolic_complete(), + level_list = nnz_row_view_t("level_list", nrows_); + level_idx = nnz_lno_view_t("level_idx", nrows_); + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1); + hlevel_ptr = nnz_lno_view_host_t("hlevel_ptr", nrows_ + 1); + level_nchunks = nnz_lno_view_host_t(); + level_nrowsperchunk = nnz_lno_view_host_t(); iw = work_view_t(); + reset_symbolic_complete(); } virtual ~SPILUKHandle(){}; From 0b1d20fd502659fb44059cf5de5f772249546f91 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Sat, 20 Jan 2024 17:23:13 -0600 Subject: [PATCH 149/326] spadd: add APIs without an execution space argument (#2090) --- sparse/src/KokkosSparse_spadd.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sparse/src/KokkosSparse_spadd.hpp b/sparse/src/KokkosSparse_spadd.hpp index 4151ea6783..127400c752 100644 --- a/sparse/src/KokkosSparse_spadd.hpp +++ b/sparse/src/KokkosSparse_spadd.hpp @@ -106,6 +106,12 @@ void spadd_symbolic( } } +// one without an execution space arg +template +void spadd_symbolic(KernelHandle *handle, Args... args) { + spadd_symbolic(typename KernelHandle::HandleExecSpace{}, handle, args...); +} + template +void spadd_numeric(KernelHandle *handle, Args... args) { + spadd_numeric(typename KernelHandle::HandleExecSpace{}, handle, args...); +} } // namespace Experimental // Symbolic: count entries in each row in C to produce rowmap From 9fa4a081c20ef5924c4def06d5bb21b288e1cdb4 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 5 Feb 2024 17:21:05 -0700 Subject: [PATCH 150/326] Lapack - SVD: adding initial files that do not implement anything (#2092) Adding SVD feature to Lapack component, the interface is similar to classic Lapack and the implementation relies on the TPL layer to provide initial capabilities. The TPL supported are LAPACK, MKL, cuSOLVER and rocSOLVER. Testing three analytical cases 2x2, 2x3 and 3x2 and then some randomly generated matrices. --- .gitignore | 5 +- lapack/CMakeLists.txt | 7 + .../svd/KokkosLapack_svd_eti_spec_inst.cpp.in | 26 + .../KokkosLapack_svd_eti_spec_avail.hpp.in | 24 + lapack/impl/KokkosLapack_svd_impl.hpp | 34 + lapack/impl/KokkosLapack_svd_spec.hpp | 156 ++++ lapack/src/KokkosLapack_svd.hpp | 246 +++++++ lapack/tpls/KokkosLapack_Host_tpl.cpp | 66 ++ lapack/tpls/KokkosLapack_Host_tpl.hpp | 6 + .../tpls/KokkosLapack_svd_tpl_spec_avail.hpp | 171 +++++ .../tpls/KokkosLapack_svd_tpl_spec_decl.hpp | 687 ++++++++++++++++++ lapack/unit_test/Test_Lapack.hpp | 1 + lapack/unit_test/Test_Lapack_svd.hpp | 642 ++++++++++++++++ 13 files changed, 2070 insertions(+), 1 deletion(-) create mode 100644 lapack/eti/generated_specializations_cpp/svd/KokkosLapack_svd_eti_spec_inst.cpp.in create mode 100644 lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_avail.hpp.in create mode 100644 lapack/impl/KokkosLapack_svd_impl.hpp create mode 100644 lapack/impl/KokkosLapack_svd_spec.hpp create mode 100644 lapack/src/KokkosLapack_svd.hpp create mode 100644 lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp create mode 100644 lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp create mode 100644 lapack/unit_test/Test_Lapack_svd.hpp diff --git a/.gitignore b/.gitignore index d64726e92e..6dcc5d6a5d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,7 @@ TAGS #Clangd indexing compile_commands.json .cache/ -.vscode/ \ No newline at end of file +.vscode/ + +#MacOS hidden files +.DS_Store diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index ee91079378..f825a2184a 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -58,3 +58,10 @@ KOKKOSKERNELS_GENERATE_ETI(Lapack_trtri trtri SOURCE_LIST SOURCES TYPE_LISTS FLOATS LAYOUTS DEVICES ) + +KOKKOSKERNELS_GENERATE_ETI(Lapack_svd svd + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) diff --git a/lapack/eti/generated_specializations_cpp/svd/KokkosLapack_svd_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/svd/KokkosLapack_svd_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..62dd75475f --- /dev/null +++ b/lapack/eti/generated_specializations_cpp/svd/KokkosLapack_svd_eti_spec_inst.cpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosLapack_svd_spec.hpp" + +namespace KokkosLapack { +namespace Impl { +@LAPACK_SVD_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..49e526b7e8 --- /dev/null +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_avail.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { +namespace Impl { +@LAPACK_SVD_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/lapack/impl/KokkosLapack_svd_impl.hpp b/lapack/impl/KokkosLapack_svd_impl.hpp new file mode 100644 index 0000000000..49df758936 --- /dev/null +++ b/lapack/impl/KokkosLapack_svd_impl.hpp @@ -0,0 +1,34 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_IMPL_SVD_HPP_ +#define KOKKOSLAPACK_IMPL_SVD_HPP_ + +/// \file KokkosLapack_svd_impl.hpp +/// \brief Implementation(s) of singular value decomposition of a dense matrix. + +#include +#include + +namespace KokkosLapack { +namespace Impl { + +// NOTE: Might add the implementation of KokkosLapack::svd later + +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_IMPL_SVD_HPP diff --git a/lapack/impl/KokkosLapack_svd_spec.hpp b/lapack/impl/KokkosLapack_svd_spec.hpp new file mode 100644 index 0000000000..fc0a34f790 --- /dev/null +++ b/lapack/impl/KokkosLapack_svd_spec.hpp @@ -0,0 +1,156 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ +#define KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ + +#include +#include +#include + +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosLapack { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct svd_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosLapack + +// +// Macro for declaration of full specialization availability +// KokkosLapack::Impl::SVD. This is NOT for users!!! All +// the declarations of full specializations go in this header file. +// We may spread out definitions (see _INST macro below) across one or +// more .cpp files. +// +#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct svd_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosLapack { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosLapack::svd. + +template ::value, + bool eti_spec_avail = svd_eti_spec_avail< + ExecutionSpace, AMatrix, SVector, UMatrix, VMatrix>::value> +struct SVD { + static void svd(const ExecutionSpace &space, const char jobu[], + const char jobvt[], const AMatrix &A, const SVector &S, + const UMatrix &U, const VMatrix &Vt); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +//! Full specialization of svd +// Unification layer +template +struct SVD { + static void svd(const ExecutionSpace & /* space */, const char * /* jobu */, + const char * /* jobvt */, const AMatrix & /* A */, + const SVector & /* S */, const UMatrix & /* U */, + const VMatrix & /* Vt */) { + // NOTE: Might add the implementation of KokkosLapack::svd later + throw std::runtime_error( + "No fallback implementation of SVD (singular value decomposition) " + "exists. Enable LAPACK, CUSOLVER or ROCSOLVER TPL to use this " + "function."); + } +}; + +#endif +} // namespace Impl +} // namespace KokkosLapack + +// +// Macro for declaration of full specialization of +// KokkosLapack::Impl::SVD. This is NOT for users!!! All +// the declarations of full specializations go in this header file. +// We may spread out definitions (see _DEF macro below) across one or +// more .cpp files. +// +#define KOKKOSLAPACK_SVD_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#define KOKKOSLAPACK_SVD_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#include + +#endif // KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ diff --git a/lapack/src/KokkosLapack_svd.hpp b/lapack/src/KokkosLapack_svd.hpp new file mode 100644 index 0000000000..71ea7cc30f --- /dev/null +++ b/lapack/src/KokkosLapack_svd.hpp @@ -0,0 +1,246 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosLapack_svd.hpp +/// \brief Singular Value Decomposition (SVD) +/// +/// This file provides KokkosLapack::svd. This function performs a +/// local (no MPI) singular value decomposition of the input matrix A +/// and returns the singular values and vectors dedending on input flags. + +#ifndef KOKKOSLAPACK_SVD_HPP_ +#define KOKKOSLAPACK_SVD_HPP_ + +#include + +#include "KokkosLapack_svd_spec.hpp" +#include "KokkosKernels_Error.hpp" + +namespace KokkosLapack { + +// clang-format off +/// \brief Compute the Singular Value Decomposition of A = U*S*Vt +/// +/// \tparam ExecutionSpace the space where the kernel will run. +/// \tparam AMatrix (mxn) matrix as a rank-2 Kokkos::View. +/// \tparam SVector min(m,n) vector as a rank-1 Kokkos::View +/// \tparam UMatrix (mxm) matrix as a rank-2 Kokkos::View +/// \tparam VMatrix (nxn) matrix as a rank-2 Kokkos::View +/// +/// \param space [in] execution space instance used to specified how to execute +/// the svd kernels. +/// \param jobu [in] flag to control the computation of the left singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param jobvt [in] flag to control the computation of the right singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param A [in] An m-by-n matrix to be decomposed using its singular values. +/// \param S [out] Vector of the min(m, n) singular values of A. +/// \param U [out] the first min(m, n) columns of U are the left singular +/// vectors of A. +/// \param Vt [out] the first min(m, n) columns of Vt are the right singular +/// vectors of A. +/// +// clang-format on +template +void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], + const AMatrix& A, const SVector& S, const UMatrix& U, + const VMatrix& Vt) { + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: A must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: S must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: U must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: Vt must be a Kokkos::View."); + static_assert(AMatrix::rank() == 2, "KokkosLapack::svd: A must have rank 2."); + static_assert(SVector::rank() == 1, "KokkosLapack::svd: S must have rank 1."); + static_assert(UMatrix::rank() == 2, "KokkosLapack::svd: U must have rank 2."); + static_assert(VMatrix::rank() == 2, + "KokkosLapack::svd: Vt must have rank 2."); + + int64_t m = A.extent(0); + int64_t n = A.extent(1); + int64_t rankA = Kokkos::min(m, n); + + // No work to do since the matrix is empty... + // Also do not send a matrix with size zero + // to Lapack TPLs or they will complain! + if ((m == 0) || (n == 0)) { + return; + } + + // Check the jobu and jobvt control flags + // The only valid options there are 'A', 'S', 'O' and 'N' + const bool is_jobu_invalid = + !((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || + (jobu[0] == 's') || (jobu[0] == 'O') || (jobu[0] == 'o') || + (jobu[0] == 'N') || (jobu[0] == 'n')); + + const bool is_jobvt_invalid = + !((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || + (jobvt[0] == 's') || (jobvt[0] == 'O') || (jobvt[0] == 'o') || + (jobvt[0] == 'N') || (jobvt[0] == 'n')); + + if (is_jobu_invalid && is_jobvt_invalid) { + std::ostringstream oss; + oss << "KokkosLapack::svd: both jobu and jobvt are invalid!\n" + << "Possible values are A, S, O or N, submitted values are " << jobu[0] + << " and " << jobvt[0] << "\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + if (is_jobu_invalid) { + std::ostringstream oss; + oss << "KokkosLapack::svd: jobu is invalid!\n" + << "Possible values are A, S, O or N, submitted value is " << jobu[0] + << "\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + if (is_jobvt_invalid) { + std::ostringstream oss; + oss << "KokkosLapack::svd: jobvt is invalid!\n" + << "Possible values are A, S, O or N, submitted value is " << jobvt[0] + << "\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + + if (((jobu[0] == 'O') || (jobu[0] == 'o')) && + ((jobvt[0] == 'O') || (jobvt[0] == 'o'))) { + std::ostringstream oss; + oss << "KokkosLapack::svd: jobu and jobvt cannot be O at the same time!\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + + // Check validity of output views sizes + // Note that of jobu/jobvt are set to O or N + // then the associated matrix does not need storage + bool is_extent_invalid = false; + std::ostringstream os; + if (S.extent_int(0) != rankA) { + is_extent_invalid = true; + os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " + << rankA << ".\n"; + } + if ((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || + (jobu[0] == 's')) { + if (U.extent_int(0) != m || U.extent_int(1) != m) { + is_extent_invalid = true; + os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " + << U.extent(1) << ") instead of (" << m << ", " << m << ").\n"; + } + } + if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || + (jobvt[0] == 's')) { + if (Vt.extent_int(0) != n || Vt.extent_int(1) != n) { + is_extent_invalid = true; + os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " + << Vt.extent(1) << ") instead of (" << n << ", " << n << ").\n"; + } + } + if (is_extent_invalid) { + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + if (std::is_same_v && + (A.extent(0) < A.extent(1))) { + throw std::runtime_error( + "CUSOLVER does not support SVD for matrices with more columns " + "than rows, you can transpose you matrix first then compute " + "SVD of that transpose: At=VSUt, and swap the output U and Vt" + " and transpose them to recover the desired SVD."); + } +#endif + + using AMatrix_Internal = Kokkos::View< + typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, + typename AMatrix::device_type, Kokkos::MemoryTraits>; + + using SVector_Internal = Kokkos::View< + typename SVector::non_const_value_type*, typename SVector::array_layout, + typename SVector::device_type, Kokkos::MemoryTraits>; + + using UMatrix_Internal = Kokkos::View< + typename UMatrix::non_const_value_type**, typename UMatrix::array_layout, + typename UMatrix::device_type, Kokkos::MemoryTraits>; + + using VMatrix_Internal = Kokkos::View< + typename VMatrix::non_const_value_type**, typename VMatrix::array_layout, + typename VMatrix::device_type, Kokkos::MemoryTraits>; + + AMatrix_Internal A_i = A; + SVector_Internal S_i = S; + UMatrix_Internal U_i = U; + VMatrix_Internal Vt_i = Vt; + + KokkosLapack::Impl::SVD::svd(space, jobu, + jobvt, A_i, + S_i, U_i, + Vt_i); +} + +// clang-format off +/// \brief Compute the Singular Value Decomposition of A = U*S*Vt +/// +/// \tparam AMatrix (mxn) matrix as a rank-2 Kokkos::View. +/// \tparam SVector min(m,n) vector as a rank-1 Kokkos::View +/// \tparam UMatrix (mxm) matrix as a rank-2 Kokkos::View +/// \tparam VMatrix (nxn) matrix as a rank-2 Kokkos::View +/// +/// \param jobu [in] flag to control the computation of the left singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param jobvt [in] flag to control the computation of the right singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param A [in] An m-by-n matrix to be decomposed using its singular values. +/// \param S [out] Vector of the min(m, n) singular values of A. +/// \param U [out] the first min(m, n) columns of U are the left singular +/// vectors of A. +/// \param Vt [out] the first min(m, n) columns of Vt are the right singular +/// vectors of A. +/// +// clang-format on +template +void svd(const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { + typename AMatrix::execution_space space{}; + svd(space, jobu, jobvt, A, S, U, Vt); +} + +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_SVD_HPP_ diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp index d629a17f1d..add0a802bd 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -38,6 +38,31 @@ void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); +/// +/// Gesvd +/// + +void F77_BLAS_MANGLE(sgesvd, SGESVD)(const char*, const char*, const int*, + const int*, float*, const int*, float*, + float*, const int*, float*, const int*, + float*, int*, int*); +void F77_BLAS_MANGLE(dgesvd, DGESVD)(const char*, const char*, const int*, + const int*, double*, const int*, double*, + double*, const int*, double*, const int*, + double*, int*, int*); +void F77_BLAS_MANGLE(cgesvd, CGESVD)(const char*, const char*, const int*, + const int*, std::complex*, + const int*, float*, std::complex*, + const int*, std::complex*, + const int*, std::complex*, int*, + float*, int*); +void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, + const int*, std::complex*, + const int*, double*, std::complex*, + const int*, std::complex*, + const int*, std::complex*, int*, + double*, int*); + /// /// Trtri /// @@ -64,6 +89,11 @@ void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, #define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) #define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) +#define F77_FUNC_SGESVD F77_BLAS_MANGLE(sgesvd, SGESVD) +#define F77_FUNC_DGESVD F77_BLAS_MANGLE(dgesvd, DGESVD) +#define F77_FUNC_CGESVD F77_BLAS_MANGLE(cgesvd, CGESVD) +#define F77_FUNC_ZGESVD F77_BLAS_MANGLE(zgesvd, ZGESVD) + #define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) #define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) #define F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) @@ -82,6 +112,15 @@ void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, + const int n, float* a, const int lda, float* s, + float* u, const int ldu, float* vt, + const int ldvt, float* work, int lwork, + float* /*rwork*/, int info) { + F77_FUNC_SGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, &info); +} +template <> int HostLapack::trtri(const char uplo, const char diag, int n, const float* a, int lda) { int info = 0; @@ -99,6 +138,15 @@ void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, + const int n, double* a, const int lda, double* s, + double* u, const int ldu, double* vt, + const int ldvt, double* work, int lwork, + double* /*rwork*/, int info) { + F77_FUNC_DGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, &info); +} +template <> int HostLapack::trtri(const char uplo, const char diag, int n, const double* a, int lda) { int info = 0; @@ -118,6 +166,15 @@ void HostLapack >::gesv(int n, int rhs, F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack >::gesvd( + const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, float* s, std::complex* u, + const int ldu, std::complex* vt, const int ldvt, + std::complex* work, int lwork, float* rwork, int info) { + F77_FUNC_CGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, rwork, &info); +} +template <> int HostLapack >::trtri(const char uplo, const char diag, int n, const std::complex* a, int lda) { @@ -138,6 +195,15 @@ void HostLapack >::gesv(int n, int rhs, F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack >::gesvd( + const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, double* s, std::complex* u, + const int ldu, std::complex* vt, const int ldvt, + std::complex* work, int lwork, double* rwork, int info) { + F77_FUNC_ZGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, rwork, &info); +} +template <> int HostLapack >::trtri(const char uplo, const char diag, int n, const std::complex* a, diff --git a/lapack/tpls/KokkosLapack_Host_tpl.hpp b/lapack/tpls/KokkosLapack_Host_tpl.hpp index d74099aaec..9eca83afea 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.hpp @@ -33,6 +33,12 @@ struct HostLapack { static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, int info); + static void gesvd(const char jobu, const char jobvt, const int m, const int n, + T *A, const int lda, + typename Kokkos::ArithTraits::mag_type *S, T *U, + const int ldu, T *Vt, const int ldvt, T *work, int lwork, + typename Kokkos::ArithTraits::mag_type *rwork, int info); + static int trtri(const char uplo, const char diag, int n, const T *a, int lda); }; diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp new file mode 100644 index 0000000000..7a7403209f --- /dev/null +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp @@ -0,0 +1,171 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosLapack { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct svd_tpl_spec_avail { + enum : bool { value = false }; +}; + +// LAPACK +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ + defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Serial) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::OpenMP) +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Threads) +#endif + +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK || KOKKOSKERNELS_ENABLE_TPL_MKL + +// CUSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif // CUDAUVMSPACE +#endif // CUSOLVER + +// ROCSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +#endif // HIPMANAGEDSPACE +#endif // ROCSOLVER + +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_HPP_ diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp new file mode 100644 index 0000000000..bc23068c57 --- /dev/null +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp @@ -0,0 +1,687 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_SVD_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_SVD_TPL_SPEC_DECL_HPP_ + +#include "KokkosKernels_Error.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosLapack { +namespace Impl { +template +inline void svd_print_specialization() { +#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + if constexpr (std::is_same_v) { + printf( + "KokkosLapack::svd<> TPL Cusolver specialization for < %s , %s, %s, %s " + ">\n", + typeid(AMatrix).name(), typeid(SVector).name(), typeid(UMatrix).name(), + typeid(VMatrix).name()); + } +#endif +#endif +} +} // namespace Impl +} // namespace KokkosLapack + +// LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#include "KokkosLapack_Host_tpl.hpp" + +namespace KokkosLapack { +namespace Impl { + +template +void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const int m = A.extent_int(0); + const int n = A.extent_int(1); + const int lda = A.stride(1); + const int ldu = U.stride(1); + const int ldvt = Vt.stride(1); + + int lwork = -1, info = 0; + Kokkos::View rwork("svd rwork buffer", + 5 * Kokkos::min(m, n)); + Kokkos::View work("svd work buffer", 1); + if constexpr (Kokkos::ArithTraits::is_complex) { + HostLapack>::gesvd( + jobu[0], jobvt[0], m, n, + reinterpret_cast*>(A.data()), lda, S.data(), + reinterpret_cast*>(U.data()), ldu, + reinterpret_cast*>(Vt.data()), ldvt, + reinterpret_cast*>(work.data()), lwork, + rwork.data(), info); + + lwork = static_cast(work(0).real()); + + work = Kokkos::View("svd work buffer", lwork); + HostLapack>::gesvd( + jobu[0], jobvt[0], m, n, + reinterpret_cast*>(A.data()), lda, S.data(), + reinterpret_cast*>(U.data()), ldu, + reinterpret_cast*>(Vt.data()), ldvt, + reinterpret_cast*>(work.data()), lwork, + rwork.data(), info); + } else { + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), + lwork, rwork.data(), info); + + lwork = static_cast(work(0)); + + work = Kokkos::View("svd work buffer", lwork); + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), + lwork, rwork.data(), info); + } +} + +#define KOKKOSLAPACK_SVD_LAPACK(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + lapackSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP) +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" + +namespace KokkosLapack { +namespace Impl { + +template +void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const lapack_int m = A.extent_int(0); + const lapack_int n = A.extent_int(1); + const lapack_int lda = A.stride(1); + const lapack_int ldu = U.stride(1); + const lapack_int ldvt = Vt.stride(1); + + Kokkos::View rwork("svd rwork buffer", + Kokkos::min(m, n) - 1); + lapack_int ret = 0; + if constexpr (std::is_same_v) { + ret = + LAPACKE_sgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, + S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + } + if constexpr (std::is_same_v) { + ret = + LAPACKE_dgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, + S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + } + if constexpr (std::is_same_v>) { + ret = LAPACKE_cgesvd( + LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data()); + } + if constexpr (std::is_same_v>) { + ret = LAPACKE_zgesvd( + LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, + rwork.data()); + } + + if (ret != 0) { + std::ostringstream os; + os << "KokkosLapack::svd: MKL failed with return value: " << ret << "\n"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +} + +#define KOKKOSLAPACK_SVD_MKL(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + mklSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP) +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL + +// CUSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" + +namespace KokkosLapack { +namespace Impl { + +template +void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const int m = A.extent_int(0); + const int n = A.extent_int(1); + const int lda = A.stride(1); + const int ldu = U.stride(1); + const int ldvt = Vt.stride(1); + + int lwork = 0; + Kokkos::View info("svd info"); + Kokkos::View rwork("svd rwork buffer", + Kokkos::min(m, n) - 1); + + CudaLapackSingleton& s = CudaLapackSingleton::singleton(); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSetStream(s.handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd( + s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd( + s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, + reinterpret_cast(work.data()), lwork, + rwork.data(), info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnZgesvd(s.handle, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, + S.data(), reinterpret_cast(U.data()), + ldu, reinterpret_cast(Vt.data()), + ldvt, reinterpret_cast(work.data()), + lwork, rwork.data(), info.data())); + } + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::Cuda& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_CUSOLVER," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + cusolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + +// ROCSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#include +#include + +namespace KokkosLapack { +namespace Impl { + +template +void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const rocblas_int m = A.extent_int(0); + const rocblas_int n = A.extent_int(1); + const rocblas_int lda = A.stride(1); + const rocblas_int ldu = U.stride(1); + const rocblas_int ldvt = Vt.stride(1); + + rocblas_svect UVecMode = rocblas_svect_all; + if ((jobu[0] == 'S') || (jobu[0] == 's')) { + UVecMode = rocblas_svect_singular; + } else if ((jobu[0] == 'O') || (jobu[0] == 'o')) { + UVecMode = rocblas_svect_overwrite; + } else if ((jobu[0] == 'N') || (jobu[0] == 'n')) { + UVecMode = rocblas_svect_none; + } + rocblas_svect VVecMode = rocblas_svect_all; + if ((jobvt[0] == 'S') || (jobvt[0] == 's')) { + VVecMode = rocblas_svect_singular; + } else if ((jobvt[0] == 'O') || (jobvt[0] == 'o')) { + VVecMode = rocblas_svect_overwrite; + } else if ((jobvt[0] == 'N') || (jobvt[0] == 'n')) { + VVecMode = rocblas_svect_none; + } + + const rocblas_workmode WorkMode = rocblas_outofplace; + + Kokkos::View info("svd info"); + Kokkos::View rwork("svd rwork buffer", + Kokkos::min(m, n) - 1); + + KokkosBlas::Impl::RocBlasSingleton& s = + KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_set_stream(s.handle, space.hip_stream())); + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd( + s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd( + s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesvd( + s.handle, UVecMode, VVecMode, m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data(), + WorkMode, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesvd( + s.handle, UVecMode, VVecMode, m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, + rwork.data(), WorkMode, info.data())); + } + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::HIP& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_ROCSOLVER," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + rocsolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) +KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER + +#endif // KOKKOSLAPACK_SVD_TPL_SPEC_DECL_HPP_ diff --git a/lapack/unit_test/Test_Lapack.hpp b/lapack/unit_test/Test_Lapack.hpp index 815c442884..1a717521f8 100644 --- a/lapack/unit_test/Test_Lapack.hpp +++ b/lapack/unit_test/Test_Lapack.hpp @@ -18,5 +18,6 @@ #include "Test_Lapack_gesv.hpp" #include "Test_Lapack_trtri.hpp" +#include "Test_Lapack_svd.hpp" #endif // TEST_LAPACK_HPP diff --git a/lapack/unit_test/Test_Lapack_svd.hpp b/lapack/unit_test/Test_Lapack_svd.hpp new file mode 100644 index 0000000000..a0a1f31ab0 --- /dev/null +++ b/lapack/unit_test/Test_Lapack_svd.hpp @@ -0,0 +1,642 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include +#include + +#include + +namespace Test { + +template +void check_triple_product( + const AMatrix& A, const SVector& S, const UMatrix& U, const VMatrix& Vt, + typename Kokkos::ArithTraits< + typename AMatrix::non_const_value_type>::mag_type tol) { + // After a successful SVD decomposition we have A=U*S*V + // So using gemm we should be able to compare the above + // triple product to the original matrix A. + using execution_space = typename AMatrix::execution_space; + + AMatrix temp("intermediate U*S product", A.extent(0), A.extent(1)); + AMatrix M("U*S*V product", A.extent(0), A.extent(1)); + + // First compute the left side of the product: temp = U*S + Kokkos::parallel_for( + Kokkos::RangePolicy(0, U.extent_int(0)), + KOKKOS_LAMBDA(const int& rowIdx) { + for (int colIdx = 0; colIdx < U.extent_int(1); ++colIdx) { + if (colIdx < S.extent_int(0)) { + temp(rowIdx, colIdx) = U(rowIdx, colIdx) * S(colIdx); + } + } + }); + + // Second compute the right side of the product: M = temp*V = U*S*V + KokkosBlas::gemm("N", "N", 1, temp, Vt, 0, M); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + typename AMatrix::HostMirror M_h = Kokkos::create_mirror_view(M); + Kokkos::deep_copy(A_h, A); + Kokkos::deep_copy(M_h, M); + for (int rowIdx = 0; rowIdx < A.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < A.extent_int(1); ++colIdx) { + if (tol < Kokkos::abs(A_h(rowIdx, colIdx))) { + EXPECT_NEAR_KK_REL(A_h(rowIdx, colIdx), M_h(rowIdx, colIdx), tol); + } else { + EXPECT_NEAR_KK(A_h(rowIdx, colIdx), M_h(rowIdx, colIdx), tol); + } + } + } +} + +template +void check_unitary_orthogonal_matrix( + const Matrix& M, typename Kokkos::ArithTraits< + typename Matrix::non_const_value_type>::mag_type tol) { + // After a successful SVD decomposition the matrices + // U and V are unitary matrices. Thus we can check + // the property UUt=UtU=I and VVt=VtV=I using gemm. + using scalar_type = typename Matrix::non_const_value_type; + + Matrix I0("M*Mt", M.extent(0), M.extent(0)); + KokkosBlas::gemm("N", "C", 1, M, M, 0, I0); + typename Matrix::HostMirror I0_h = Kokkos::create_mirror_view(I0); + Kokkos::deep_copy(I0_h, I0); + for (int rowIdx = 0; rowIdx < M.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < M.extent_int(0); ++colIdx) { + if (rowIdx == colIdx) { + EXPECT_NEAR_KK_REL(I0_h(rowIdx, colIdx), + Kokkos::ArithTraits::one(), tol); + } else { + EXPECT_NEAR_KK(I0_h(rowIdx, colIdx), + Kokkos::ArithTraits::zero(), tol); + } + } + } + + Matrix I1("Mt*M", M.extent(1), M.extent(1)); + KokkosBlas::gemm("C", "N", 1, M, M, 0, I1); + typename Matrix::HostMirror I1_h = Kokkos::create_mirror_view(I1); + Kokkos::deep_copy(I1_h, I1); + for (int rowIdx = 0; rowIdx < M.extent_int(1); ++rowIdx) { + for (int colIdx = 0; colIdx < M.extent_int(1); ++colIdx) { + if (rowIdx == colIdx) { + EXPECT_NEAR_KK_REL(I1_h(rowIdx, colIdx), + Kokkos::ArithTraits::one(), tol); + } else { + EXPECT_NEAR_KK(I1_h(rowIdx, colIdx), + Kokkos::ArithTraits::zero(), tol); + } + } + } +} + +template +int impl_analytic_2x2_svd() { + using scalar_type = typename AMatrix::value_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + using vector_type = + Kokkos::View; + using KAT_S = Kokkos::ArithTraits; + + const mag_type eps = KAT_S::eps(); + + AMatrix A("A", 2, 2), U("U", 2, 2), Vt("Vt", 2, 2), Aref("A ref", 2, 2); + vector_type S("S", 2); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + + // A = [3 0] + // [4 5] + // USV = 1/sqrt(10) [1 -3] * sqrt(5) [3 0] * 1/sqrt(2) [ 1 1] + // [3 1] [0 1] [-1 1] + A_h(0, 0) = 3; + A_h(1, 0) = 4; + A_h(1, 1) = 5; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(Aref, A_h); + + KokkosLapack::svd("A", "A", A, S, U, Vt); + // Don't really need to fence here as we deep_copy right after... + + typename vector_type::HostMirror S_h = Kokkos::create_mirror_view(S); + Kokkos::deep_copy(S_h, S); + typename AMatrix::HostMirror U_h = Kokkos::create_mirror_view(U); + Kokkos::deep_copy(U_h, U); + typename AMatrix::HostMirror Vt_h = Kokkos::create_mirror_view(Vt); + Kokkos::deep_copy(Vt_h, Vt); + + // The singular values for this problem + // are known: sqrt(45) and sqrt(5) + EXPECT_NEAR_KK_REL(S_h(0), static_cast(Kokkos::sqrt(45)), + 100 * eps); + EXPECT_NEAR_KK_REL(S_h(1), static_cast(Kokkos::sqrt(5)), 100 * eps); + + // The singular vectors should be identical + // or of oposite sign we check the first + // component of the vectors to determine + // the proper signed comparison. + std::vector Uref = { + static_cast(1 / Kokkos::sqrt(10)), + static_cast(3 / Kokkos::sqrt(10)), + static_cast(-3 / Kokkos::sqrt(10)), + static_cast(1 / Kokkos::sqrt(10))}; + std::vector Vtref = { + static_cast(1 / Kokkos::sqrt(2)), + static_cast(-1 / Kokkos::sqrt(2)), + static_cast(1 / Kokkos::sqrt(2)), + static_cast(1 / Kokkos::sqrt(2))}; + + // Both rotations and reflections are valid + // vector basis so we need to check both signs + // to confirm proper SVD was achieved. + Kokkos::View U_real("U real", 2, 2), + Vt_real("Vt real", 2, 2); + if constexpr (KAT_S::is_complex) { + U_real(0, 0) = U_h(0, 0).real(); + U_real(0, 1) = U_h(0, 1).real(); + U_real(1, 0) = U_h(1, 0).real(); + U_real(1, 1) = U_h(1, 1).real(); + + Vt_real(0, 0) = Vt_h(0, 0).real(); + Vt_real(0, 1) = Vt_h(0, 1).real(); + Vt_real(1, 0) = Vt_h(1, 0).real(); + Vt_real(1, 1) = Vt_h(1, 1).real(); + } else { + U_real(0, 0) = U_h(0, 0); + U_real(0, 1) = U_h(0, 1); + U_real(1, 0) = U_h(1, 0); + U_real(1, 1) = U_h(1, 1); + + Vt_real(0, 0) = Vt_h(0, 0); + Vt_real(0, 1) = Vt_h(0, 1); + Vt_real(1, 0) = Vt_h(1, 0); + Vt_real(1, 1) = Vt_h(1, 1); + } + + const mag_type tol = 100 * KAT_S::eps(); + const mag_type one_sqrt10 = static_cast(1 / Kokkos::sqrt(10)); + const mag_type one_sqrt2 = static_cast(1 / Kokkos::sqrt(2)); + + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 0)), one_sqrt10, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 1)), 3 * one_sqrt10, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 0)), 3 * one_sqrt10, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 1)), one_sqrt10, tol); + + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 1)), one_sqrt2, tol); + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + check_triple_product(Aref, S, U, Vt, tol); + + return 0; +} + +template +int impl_analytic_2x3_svd() { + using scalar_type = typename AMatrix::value_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + using vector_type = + Kokkos::View; + using KAT_S = Kokkos::ArithTraits; + + const mag_type tol = 100 * KAT_S::eps(); + + AMatrix A("A", 2, 3), U("U", 2, 2), Vt("Vt", 3, 3), Aref("A ref", 2, 3); + vector_type S("S", 2); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + + // A = [3 2 2] + // [2 3 -2] + // USVt = 1/sqrt(2) [1 1] * [5 0 0] * 1/(3*sqrt(2)) [ 3 3 0] + // [1 -1] [0 3 0] [ 1 -1 4] + // [2*sqrt(2) -2*sqrt(2) + // -sqrt(2)] + A_h(0, 0) = 3; + A_h(0, 1) = 2; + A_h(0, 2) = 2; + A_h(1, 0) = 2; + A_h(1, 1) = 3; + A_h(1, 2) = -2; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(Aref, A_h); + + try { + KokkosLapack::svd("A", "A", A, S, U, Vt); + } catch (const std::runtime_error& e) { + std::string test_string = e.what(); + std::string cusolver_m_less_than_n = + "CUSOLVER does not support SVD for matrices with more columns " + "than rows, you can transpose you matrix first then compute " + "SVD of that transpose: At=VSUt, and swap the output U and Vt" + " and transpose them to recover the desired SVD."; + + if (test_string == cusolver_m_less_than_n) { + return 0; + } + } + // Don't really need to fence here as we deep_copy right after... + + typename vector_type::HostMirror S_h = Kokkos::create_mirror_view(S); + Kokkos::deep_copy(S_h, S); + typename AMatrix::HostMirror U_h = Kokkos::create_mirror_view(U); + Kokkos::deep_copy(U_h, U); + typename AMatrix::HostMirror Vt_h = Kokkos::create_mirror_view(Vt); + Kokkos::deep_copy(Vt_h, Vt); + + // The singular values for this problem + // are known: sqrt(45) and sqrt(5) + EXPECT_NEAR_KK_REL(S_h(0), static_cast(5), tol); + EXPECT_NEAR_KK_REL(S_h(1), static_cast(3), tol); + + // Both rotations and reflections are valid + // vector basis so we need to check both signs + // to confirm proper SVD was achieved. + Kokkos::View U_real("U real", 2, 2), + Vt_real("Vt real", 3, 3); + if constexpr (KAT_S::is_complex) { + U_real(0, 0) = U_h(0, 0).real(); + U_real(0, 1) = U_h(0, 1).real(); + U_real(1, 0) = U_h(1, 0).real(); + U_real(1, 1) = U_h(1, 1).real(); + + Vt_real(0, 0) = Vt_h(0, 0).real(); + Vt_real(0, 1) = Vt_h(0, 1).real(); + Vt_real(0, 2) = Vt_h(0, 2).real(); + Vt_real(1, 0) = Vt_h(1, 0).real(); + Vt_real(1, 1) = Vt_h(1, 1).real(); + Vt_real(1, 2) = Vt_h(1, 2).real(); + Vt_real(2, 0) = Vt_h(2, 0).real(); + Vt_real(2, 1) = Vt_h(2, 1).real(); + Vt_real(2, 2) = Vt_h(2, 2).real(); + } else { + U_real(0, 0) = U_h(0, 0); + U_real(0, 1) = U_h(0, 1); + U_real(1, 0) = U_h(1, 0); + U_real(1, 1) = U_h(1, 1); + + Vt_real(0, 0) = Vt_h(0, 0); + Vt_real(0, 1) = Vt_h(0, 1); + Vt_real(0, 2) = Vt_h(0, 2); + Vt_real(1, 0) = Vt_h(1, 0); + Vt_real(1, 1) = Vt_h(1, 1); + Vt_real(1, 2) = Vt_h(1, 2); + Vt_real(2, 0) = Vt_h(2, 0); + Vt_real(2, 1) = Vt_h(2, 1); + Vt_real(2, 2) = Vt_h(2, 2); + } + + const mag_type one_sqrt2 = static_cast(1 / Kokkos::sqrt(2)); + const mag_type one_sqrt18 = static_cast(1 / Kokkos::sqrt(18)); + const mag_type one_third = static_cast(1. / 3.); + + // Check values of U + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 1)), one_sqrt2, tol); + + // Check values of Vt + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK(Kokkos::abs(Vt_real(0, 2)), 0, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 0)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 1)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 2)), 4 * one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(2, 0)), 2 * one_third, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(2, 1)), 2 * one_third, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(2, 2)), one_third, tol); + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + check_triple_product(Aref, S, U, Vt, tol); + + return 0; +} + +template +int impl_analytic_3x2_svd() { + using scalar_type = typename AMatrix::value_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + using vector_type = + Kokkos::View; + using KAT_S = Kokkos::ArithTraits; + + const mag_type tol = 100 * KAT_S::eps(); + + AMatrix A("A", 3, 2), U("U", 3, 3), Vt("Vt", 2, 2), Aref("A ref", 3, 2); + vector_type S("S", 2); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + + // Note this is simply the transpose of the 2x3 matrix in the test above + // A = [3 2] + // [2 3] + // [2 -2] + // USVt = 1/(3*sqrt(2)) [3 1 2*sqrt(2)] * [5 0] * 1/sqrt(2) [1 1] + // [3 -1 -2*sqrt(2)] [0 3] [1 -1] + // [0 4 sqrt(2)] [0 0] + A_h(0, 0) = 3; + A_h(0, 1) = 2; + A_h(1, 0) = 2; + A_h(1, 1) = 3; + A_h(2, 0) = 2; + A_h(2, 1) = -2; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(Aref, A_h); + + KokkosLapack::svd("A", "A", A, S, U, Vt); + // Don't really need to fence here as we deep_copy right after... + + typename vector_type::HostMirror S_h = Kokkos::create_mirror_view(S); + Kokkos::deep_copy(S_h, S); + typename AMatrix::HostMirror U_h = Kokkos::create_mirror_view(U); + Kokkos::deep_copy(U_h, U); + typename AMatrix::HostMirror Vt_h = Kokkos::create_mirror_view(Vt); + Kokkos::deep_copy(Vt_h, Vt); + + // The singular values for this problem + // are known: sqrt(45) and sqrt(5) + EXPECT_NEAR_KK_REL(S_h(0), static_cast(5), tol); + EXPECT_NEAR_KK_REL(S_h(1), static_cast(3), tol); + + // Both rotations and reflections are valid + // vector basis so we need to check both signs + // to confirm proper SVD was achieved. + Kokkos::View U_real("U real", 3, 3), + Vt_real("Vt real", 2, 2); + if constexpr (KAT_S::is_complex) { + U_real(0, 0) = U_h(0, 0).real(); + U_real(0, 1) = U_h(0, 1).real(); + U_real(0, 2) = U_h(0, 2).real(); + U_real(1, 0) = U_h(1, 0).real(); + U_real(1, 1) = U_h(1, 1).real(); + U_real(1, 2) = U_h(1, 2).real(); + U_real(2, 0) = U_h(2, 0).real(); + U_real(2, 1) = U_h(2, 1).real(); + U_real(2, 2) = U_h(2, 2).real(); + + Vt_real(0, 0) = Vt_h(0, 0).real(); + Vt_real(0, 1) = Vt_h(0, 1).real(); + Vt_real(1, 0) = Vt_h(1, 0).real(); + Vt_real(1, 1) = Vt_h(1, 1).real(); + } else { + U_real(0, 0) = U_h(0, 0); + U_real(0, 1) = U_h(0, 1); + U_real(0, 2) = U_h(0, 2); + U_real(1, 0) = U_h(1, 0); + U_real(1, 1) = U_h(1, 1); + U_real(1, 2) = U_h(1, 2); + U_real(2, 0) = U_h(2, 0); + U_real(2, 1) = U_h(2, 1); + U_real(2, 2) = U_h(2, 2); + + Vt_real(0, 0) = Vt_h(0, 0); + Vt_real(0, 1) = Vt_h(0, 1); + Vt_real(1, 0) = Vt_h(1, 0); + Vt_real(1, 1) = Vt_h(1, 1); + } + + const mag_type one_sqrt2 = static_cast(1 / Kokkos::sqrt(2)); + const mag_type one_sqrt18 = static_cast(1 / Kokkos::sqrt(18)); + const mag_type one_third = static_cast(1. / 3.); + + // Check values of U + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 1)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 2)), 2 * one_third, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 1)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 2)), 2 * one_third, tol); + EXPECT_NEAR_KK(Kokkos::abs(U_real(2, 0)), 0, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(2, 1)), 4 * one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(2, 2)), one_third, tol); + + // Check values of Vt + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 1)), one_sqrt2, tol); + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + check_triple_product(Aref, S, U, Vt, tol); + + return 0; +} + +template +int impl_test_svd(const int m, const int n) { + using execution_space = typename Device::execution_space; + using scalar_type = typename AMatrix::value_type; + using KAT_S = Kokkos::ArithTraits; + using mag_type = typename KAT_S::mag_type; + using vector_type = + Kokkos::View; + + std::cout << "Running impl_test_svd with sizes: " << m << "x" << n + << std::endl; + + const mag_type tol = 1000 * KAT_S::eps(); + + AMatrix A("A", m, n), U("U", m, m), Vt("Vt", n, n), Aref("A ref", m, n); + vector_type S("S", Kokkos::min(m, n)); + + const uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + + // Initialize A with random numbers + scalar_type randStart = 0, randEnd = 0; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + Kokkos::deep_copy(Aref, A); + + KokkosLapack::svd("A", "A", A, S, U, Vt); + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + // For larger sizes with the triple product + // we accumulate a bit more error apparently? + check_triple_product(Aref, S, U, Vt, 100 * Kokkos::max(m, n) * tol); + + return 0; +} + +} // namespace Test + +template +int test_svd() { + int ret; + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_left = + Kokkos::View; + + ret = Test::impl_analytic_2x2_svd(); + + ret = Test::impl_analytic_2x3_svd(); + + ret = Test::impl_test_svd(0, 0); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(1, 1); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(15, 15); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 100); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 70); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(70, 100); + EXPECT_EQ(ret, 0); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_right = + Kokkos::View; + + ret = Test::impl_analytic_2x2_svd(); + + ret = Test::impl_analytic_2x3_svd(); + + ret = Test::impl_test_svd(0, 0); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(1, 1); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(15, 15); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 100); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 70); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(70, 100); + EXPECT_EQ(ret, 0); +#endif + + return 1; +} + +template +int test_svd_wrapper() { +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ + defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + if constexpr (std::is_same_v) { + // Using a device side space with LAPACK/MKL + return test_svd(); + } +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + if constexpr (std::is_same_v) { + // Using a Cuda device with CUSOLVER + return test_svd(); + } +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER) + if constexpr (std::is_same_v) { + // Using a HIP device with ROCSOLVER + return test_svd(); + } +#endif + + std::cout << "No TPL support enabled, svd is not tested" << std::endl; + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_float) { + Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_float"); + test_svd_wrapper(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_double) { + Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_double"); + test_svd_wrapper(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_complex_float) { + Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_float"); + test_svd_wrapper, TestDevice>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_complex_double) { + Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_double"); + test_svd_wrapper, TestDevice>(); + Kokkos::Profiling::popRegion(); +} +#endif From 4315f9a03e5c0a661427506618ccc6860dfd1147 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 6 Feb 2024 10:34:19 -0500 Subject: [PATCH 151/326] Hands off namespace `Kokkos::Impl` - cleanup couple violations that snuck in (#2094) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Do not use things from namespace Kokkos::Impl (Kokkos::{Impl:: -> }ALL_t) * Do not use things from namespace Kokkos::Impl (Kokkos::Impl::DeepCopy) Can achieve the same with Kokkos::deep_copy * Fix warning `declaration of ‘std::size_t n’ shadows a parameter` --- batched/KokkosBatched_Util.hpp | 5 ++--- sparse/impl/KokkosSparse_coo2crs_impl.hpp | 9 +++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 9078281e59..71c40482d6 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -629,8 +629,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, #if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::Impl::ALL_t i2, - Kokkos::Impl::ALL_t i3, + Kokkos::ALL_t i2, Kokkos::ALL_t i3, const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); @@ -674,7 +673,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( #if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, + ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); diff --git a/sparse/impl/KokkosSparse_coo2crs_impl.hpp b/sparse/impl/KokkosSparse_coo2crs_impl.hpp index d00a6f34a9..f11032903d 100644 --- a/sparse/impl/KokkosSparse_coo2crs_impl.hpp +++ b/sparse/impl/KokkosSparse_coo2crs_impl.hpp @@ -196,8 +196,13 @@ class Coo2Crs { reinterpret_cast(Kokkos::kokkos_malloc( "m_umaps", m_nrows * sizeof(UmapType))); - using shallow_copy_to_device = - Kokkos::Impl::DeepCopy; + auto shallow_copy_to_device = [](UmapType *dst, UmapType const *src, + std::size_t cnt) { + std::size_t nn = cnt / sizeof(UmapType); + Kokkos::deep_copy( + Kokkos::View(dst, nn), + Kokkos::View(src, nn)); + }; UmapType **umap_ptrs = new UmapType *[m_nrows]; // TODO: use host-level parallel_for with tag rowmapRp1 From 8a99aaacf344126e136c10fbca95ca9dd5da95fc Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Wed, 31 Jan 2024 14:04:11 -0500 Subject: [PATCH 152/326] Change name of yaml-cpp to yamlcpp --- cmake/Dependencies.cmake | 2 +- perf_test/performance/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 13223259ef..9ac02b06f6 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,7 +1,7 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES Kokkos LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE CUSOLVER ROCBLAS ROCSPARSE - TEST_OPTIONAL_TPLS yaml-cpp + TEST_OPTIONAL_TPLS yamlcpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in # the macro 'KOKKOSKERNELS_ADD_TPL_OPTION' that resides in cmake/kokkoskernels_tpls.cmake. diff --git a/perf_test/performance/CMakeLists.txt b/perf_test/performance/CMakeLists.txt index 93d377ba60..f282debe84 100644 --- a/perf_test/performance/CMakeLists.txt +++ b/perf_test/performance/CMakeLists.txt @@ -7,9 +7,9 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) # performance_example is a simple example of using it. #don't assert that this is defined anymore -#ASSERT_DEFINED(TPL_ENABLE_yaml-cpp) +#ASSERT_DEFINED(TPL_ENABLE_yamlcpp) -IF(TPL_ENABLE_yaml-cpp) +IF(TPL_ENABLE_yamlcpp) KOKKOSKERNELS_ADD_UNIT_TEST( performance_validate From d77f1143ca65744fc7c119d340848b96bdbba658 Mon Sep 17 00:00:00 2001 From: Caleb Schilly Date: Mon, 5 Feb 2024 16:52:42 -0500 Subject: [PATCH 153/326] Fix macro setting in CMakeLists --- perf_test/performance/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/performance/CMakeLists.txt b/perf_test/performance/CMakeLists.txt index f282debe84..601b33256c 100644 --- a/perf_test/performance/CMakeLists.txt +++ b/perf_test/performance/CMakeLists.txt @@ -9,7 +9,7 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) #don't assert that this is defined anymore #ASSERT_DEFINED(TPL_ENABLE_yamlcpp) -IF(TPL_ENABLE_yamlcpp) +IF(${PACKAGE_NAME}_ENABLE_yamlcpp) KOKKOSKERNELS_ADD_UNIT_TEST( performance_validate From 49c43085d45365e8bd3343d4fffe66809a905c10 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 6 Feb 2024 15:19:02 -0700 Subject: [PATCH 154/326] GMRES: Add support for BSR matrices Also, add a test for this. --- sparse/impl/KokkosSparse_gmres_impl.hpp | 2 +- sparse/impl/KokkosSparse_gmres_spec.hpp | 19 ++- sparse/src/KokkosSparse_gmres.hpp | 28 ++- sparse/unit_test/Test_Sparse_gmres.hpp | 217 ++++++++++++++---------- 4 files changed, 169 insertions(+), 97 deletions(-) diff --git a/sparse/impl/KokkosSparse_gmres_impl.hpp b/sparse/impl/KokkosSparse_gmres_impl.hpp index 8c7231f90c..f616bfe8f3 100644 --- a/sparse/impl/KokkosSparse_gmres_impl.hpp +++ b/sparse/impl/KokkosSparse_gmres_impl.hpp @@ -70,7 +70,7 @@ struct GmresWrap { Kokkos::Profiling::pushRegion("GMRES::TotalTime:"); // Store solver options: - const auto n = A.numRows(); + const auto n = A.numPointRows(); const int m = thandle.get_m(); const auto maxRestart = thandle.get_max_restart(); const auto tol = thandle.get_tol(); diff --git a/sparse/impl/KokkosSparse_gmres_spec.hpp b/sparse/impl/KokkosSparse_gmres_spec.hpp index bfe1c4539a..a588793ff8 100644 --- a/sparse/impl/KokkosSparse_gmres_spec.hpp +++ b/sparse/impl/KokkosSparse_gmres_spec.hpp @@ -23,6 +23,7 @@ #include #include #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_BsrMatrix.hpp" #include "KokkosKernels_Handle.hpp" // Include the actual functors @@ -81,10 +82,15 @@ template ::value> struct GMRES { - using AMatrix = CrsMatrix; + using AMatrix = CrsMatrix; + using BAMatrix = KokkosSparse::Experimental::BsrMatrix; static void gmres( KernelHandle *handle, const AMatrix &A, const BType &B, XType &X, KokkosSparse::Experimental::Preconditioner *precond = nullptr); + + static void gmres( + KernelHandle *handle, const BAMatrix &A, const BType &B, XType &X, + KokkosSparse::Experimental::Preconditioner *precond = nullptr); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -104,6 +110,17 @@ struct GMRES; + static void gmres( + KernelHandle *handle, const BAMatrix &A, const BType &B, XType &X, + KokkosSparse::Experimental::Preconditioner *precond = nullptr) { + auto gmres_handle = handle->get_gmres_handle(); + using Gmres = Experimental::GmresWrap< + typename std::remove_pointer::type>; + + Gmres::gmres(*gmres_handle, A, B, X, precond); + } }; #endif diff --git a/sparse/src/KokkosSparse_gmres.hpp b/sparse/src/KokkosSparse_gmres.hpp index 31b736c393..b0b708a330 100644 --- a/sparse/src/KokkosSparse_gmres.hpp +++ b/sparse/src/KokkosSparse_gmres.hpp @@ -89,8 +89,9 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, "gmres: A size type must match KernelHandle entry " "type (aka size_type, and const doesn't matter)"); - static_assert(KokkosSparse::is_crs_matrix::value, - "gmres: A is not a CRS matrix."); + static_assert(KokkosSparse::is_crs_matrix::value || + KokkosSparse::Experimental::is_bsr_matrix::value, + "gmres: A is not a CRS or BSR matrix."); static_assert(Kokkos::is_view::value, "gmres: B is not a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -120,8 +121,10 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; if ((X.extent(0) != B.extent(0)) || - (static_cast(A.numCols()) != static_cast(X.extent(0))) || - (static_cast(A.numRows()) != static_cast(B.extent(0)))) { + (static_cast(A.numPointCols()) != + static_cast(X.extent(0))) || + (static_cast(A.numPointRows()) != + static_cast(B.extent(0)))) { std::ostringstream os; os << "KokkosSparse::gmres: Dimensions do not match: " << ", A: " << A.numRows() << " x " << A.numCols() @@ -135,11 +138,20 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, const_handle_type tmp_handle(*handle); - using AMatrix_Internal = KokkosSparse::CrsMatrix< + using AMatrix_Bsr_Internal = KokkosSparse::Experimental::BsrMatrix< typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, typename AMatrix::device_type, Kokkos::MemoryTraits, typename AMatrix::const_size_type>; + using AMatrix_Internal = std::conditional_t< + KokkosSparse::is_crs_matrix::value, + KokkosSparse::CrsMatrix, + typename AMatrix::const_size_type>, + AMatrix_Bsr_Internal>; + using B_Internal = Kokkos::View< typename BType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, @@ -154,9 +166,9 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, using Precond_Internal = Preconditioner; - AMatrix_Internal A_i = A; - B_Internal b_i = B; - X_Internal x_i = X; + AMatrix_Internal A_i(A); + B_Internal b_i = B; + X_Internal x_i = X; Precond_Internal* precond_i = reinterpret_cast(precond); diff --git a/sparse/unit_test/Test_Sparse_gmres.hpp b/sparse/unit_test/Test_Sparse_gmres.hpp index 1990087526..7b624c7f75 100644 --- a/sparse/unit_test/Test_Sparse_gmres.hpp +++ b/sparse/unit_test/Test_Sparse_gmres.hpp @@ -48,120 +48,163 @@ struct TolMeta { static constexpr float value = 1e-5; // Lower tolerance for floats }; +template ::value>::type* = nullptr> +AType get_A(int n, int diagDominance, int) { + using lno_t = typename Crs::ordinal_type; + typename Crs::non_const_size_type nnz = 10 * n; + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + n, n, nnz, 0, lno_t(0.01 * n), diagDominance); + KokkosSparse::sort_crs_matrix(A); + + return A; +} + +template ::value>::type* = nullptr> +AType get_A(int n, int diagDominance, int block_size) { + using lno_t = typename Crs::ordinal_type; + typename Crs::non_const_size_type nnz = 10 * n; + auto A_unblocked = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + n, n, nnz, 0, lno_t(0.01 * n), diagDominance); + KokkosSparse::sort_crs_matrix(A_unblocked); + + // Convert to BSR + AType A(A_unblocked, block_size); + + return A; +} + template -void run_test_gmres() { - using exe_space = typename device::execution_space; - using mem_space = typename device::memory_space; - using sp_matrix_type = - KokkosSparse::CrsMatrix; +struct GmresTest { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using AT = Kokkos::ArithTraits; + using exe_space = typename device::execution_space; + using mem_space = typename device::memory_space; + + using Crs = CrsMatrix; + using Bsr = BsrMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; using float_t = typename Kokkos::ArithTraits::mag_type; - // Create a diagonally dominant sparse matrix to test: - constexpr auto n = 5000; - constexpr auto m = 15; - constexpr auto tol = TolMeta::value; - constexpr auto numRows = n; - constexpr auto numCols = n; - constexpr auto diagDominance = 1; - constexpr bool verbose = false; - - typename sp_matrix_type::non_const_size_type nnz = 10 * numRows; - auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< - sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), - diagDominance); - - // Make kernel handles - KernelHandle kh; - kh.create_gmres_handle(m, tol); - auto gmres_handle = kh.get_gmres_handle(); - using GMRESHandle = - typename std::remove_reference::type; - using ViewVectorType = typename GMRESHandle::nnz_value_view_t; - - // Set initial vectors: - ViewVectorType X("X", n); // Solution and initial guess - ViewVectorType Wj("Wj", n); // For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), - n); // right-hand side vec - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B, 1.0); - - gmres_handle->set_verbose(verbose); - - // Test CGS2 - { - gmres(&kh, A, B, X); - - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - float_t endRes = KokkosBlas::nrm2(B) / nrmB; - - const auto conv_flag = gmres_handle->get_conv_flag_val(); - - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - } + template + static void run_test_gmres() { + using sp_matrix_type = std::conditional_t; + + // Create a diagonally dominant sparse matrix to test: + constexpr auto n = 5000; + constexpr auto m = 15; + constexpr auto tol = TolMeta::value; + constexpr auto diagDominance = 1; + constexpr bool verbose = true; + constexpr auto block_size = UseBlocks ? 10 : 1; + + auto A = get_A(n, diagDominance, block_size); + + if (verbose) { + std::cout << "Running GMRES test with block_size=" << block_size + << std::endl; + } + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + using GMRESHandle = + typename std::remove_reference::type; + using ViewVectorType = typename GMRESHandle::nnz_value_view_t; + + // Set initial vectors: + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); - // Test MGS - { - gmres_handle->reset_handle(m, tol); - gmres_handle->set_ortho(GMRESHandle::Ortho::MGS); gmres_handle->set_verbose(verbose); - // reset X for next gmres call - Kokkos::deep_copy(X, 0.0); + // Test CGS2 + { + gmres(&kh, A, B, X); - gmres(&kh, A, B, X); + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - float_t endRes = KokkosBlas::nrm2(B) / nrmB; + const auto conv_flag = gmres_handle->get_conv_flag_val(); - const auto conv_flag = gmres_handle->get_conv_flag_val(); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - } + // Test MGS + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_ortho(GMRESHandle::Ortho::MGS); + gmres_handle->set_verbose(verbose); - // Test GSS2 with simple preconditioner - { - gmres_handle->reset_handle(m, tol); - gmres_handle->set_verbose(verbose); + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); + + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); - // Make precond - KokkosSparse::Experimental::MatrixPrec myPrec(A); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } - // reset X for next gmres call - Kokkos::deep_copy(X, 0.0); + // Test GSS2 with simple preconditioner + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); - gmres(&kh, A, B, X, &myPrec); + // Make precond + KokkosSparse::Experimental::MatrixPrec myPrec(A); - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - float_t endRes = KokkosBlas::nrm2(B) / nrmB; + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); - const auto conv_flag = gmres_handle->get_conv_flag_val(); + gmres(&kh, A, B, X, &myPrec); - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } } -} +}; } // namespace Test template void test_gmres() { - Test::run_test_gmres(); + using TestStruct = Test::GmresTest; + TestStruct::template run_test_gmres(); + TestStruct::template run_test_gmres(); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From adb3064e6dfbd837dd433a51c7efc5fe7c2ef4ad Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 8 Feb 2024 09:56:57 -0500 Subject: [PATCH 155/326] Remove all mentions of HBWSpace --- cm_generate_makefile.bash | 18 ++---------------- cmake/kokkoskernels_eti_devices.cmake | 12 ------------ .../sparse/KokkosSparse_spgemm_jacobi.cpp | 7 ------- 3 files changed, 2 insertions(+), 35 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 426827db00..e872789c72 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -327,7 +327,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -494,10 +493,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -717,15 +712,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - # Currently assumes script is in base kokkos-kernels directory if [ ! -e ${KOKKOSKERNELS_PATH}/CMakeLists.txt ]; then @@ -818,9 +804,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index 8bd131f2a4..b617624a36 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -28,7 +28,6 @@ SET(MEM_SPACES MEMSPACE_SYCLSHAREDSPACE MEMSPACE_OPENMPTARGET MEMSPACE_HOSTSPACE - MEMSPACE_HBWSPACE ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) @@ -38,7 +37,6 @@ SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) SET(MEMSPACE_OPENMPTARGETSPACE_CPP_TYPE Kokkos::Experimental::OpenMPTargetSpace) SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) -SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) IF(KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_OPTION( @@ -163,13 +161,6 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to pre instantiate kernels for the memory space Kokkos::HostSpace. Disabling this when one of the Host execution spaces is enabled may increase build times. Default: ON" ) -KOKKOSKERNELS_ADD_OPTION( - INST_MEMSPACE_HBWSPACE - OFF - BOOL - "Whether to pre instantiate kernels for the memory space Kokkos::HBWSpace." -) - KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_OPENMP ${KOKKOSKERNELS_INST_EXECSPACE_OPENMP_DEFAULT} @@ -211,9 +202,6 @@ SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE HIPMANAGEDSPACE) SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_OPENMPTARGET_VALID_MEM_SPACES OPENMPTARGETSPACE) -SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) -SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) -SET(EXECSPACE_THREADS_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(DEVICES) FOREACH(EXEC ${EXEC_SPACES}) IF (KOKKOSKERNELS_INST_${EXEC}) diff --git a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index 0f705e1209..33cb8a0f5f 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -237,17 +237,10 @@ int main(int argc, char** argv) { Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) - if (params.use_openmp) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_HBWSPACE - KokkosKernels::Experiment::run_spgemm_jacobi< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::Experimental::HBWSpace, Kokkos::HostSpace>(params); -#else KokkosKernels::Experiment::run_spgemm_jacobi< size_type, lno_t, scalar_t, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, Kokkos::OpenMP::memory_space>(params); -#endif } #endif From 2ae345291b1158845b962a9ffdeb6ab3956f3e0f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 8 Feb 2024 12:26:27 -0700 Subject: [PATCH 156/326] Reintroduce EXECSPACE_(SERIAL,OPENMP,THREADS}_VALID_MEM_SPACES Drop HBWSPACE as an option --- cmake/kokkoskernels_eti_devices.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index b617624a36..8c38be098c 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -202,6 +202,9 @@ SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE HIPMANAGEDSPACE) SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_OPENMPTARGET_VALID_MEM_SPACES OPENMPTARGETSPACE) +SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HOSTSPACE) +SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HOSTSPACE) +SET(EXECSPACE_THREADS_VALID_MEM_SPACES HOSTSPACE) SET(DEVICES) FOREACH(EXEC ${EXEC_SPACES}) IF (KOKKOSKERNELS_INST_${EXEC}) From 401f6c28c94ee2bbca44b2c5a9a2d81dc3d7ad56 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 6 Feb 2024 11:01:40 -0700 Subject: [PATCH 157/326] Lapack: adding svd benchmark Fixing unit-test for CUSOLVER and adding benchmark to check the algorithm performance on various platforms. --- lapack/unit_test/Test_Lapack_svd.hpp | 17 ++- perf_test/CMakeLists.txt | 1 + perf_test/lapack/CMakeLists.txt | 8 ++ .../lapack/KokkosLapack_SVD_benchmark.cpp | 124 ++++++++++++++++++ 4 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 perf_test/lapack/CMakeLists.txt create mode 100644 perf_test/lapack/KokkosLapack_SVD_benchmark.cpp diff --git a/lapack/unit_test/Test_Lapack_svd.hpp b/lapack/unit_test/Test_Lapack_svd.hpp index a0a1f31ab0..032b9f86c6 100644 --- a/lapack/unit_test/Test_Lapack_svd.hpp +++ b/lapack/unit_test/Test_Lapack_svd.hpp @@ -477,7 +477,8 @@ int impl_test_svd(const int m, const int n) { std::cout << "Running impl_test_svd with sizes: " << m << "x" << n << std::endl; - const mag_type tol = 1000 * KAT_S::eps(); + const mag_type max_val = 10; + const mag_type tol = 1000 * max_val * KAT_S::eps(); AMatrix A("A", m, n), U("U", m, m), Vt("Vt", n, n), Aref("A ref", m, n); vector_type S("S", Kokkos::min(m, n)); @@ -488,11 +489,23 @@ int impl_test_svd(const int m, const int n) { // Initialize A with random numbers scalar_type randStart = 0, randEnd = 0; - Test::getRandomBounds(10.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(A, rand_pool, randStart, randEnd); Kokkos::deep_copy(Aref, A); + // Working around CUSOLVER constraint for m >= n +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + if constexpr (std::is_same_v) { + if (m >= n) { + KokkosLapack::svd("A", "A", A, S, U, Vt); + } + } else { + KokkosLapack::svd("A", "A", A, S, U, Vt); + } +#else KokkosLapack::svd("A", "A", A, S, U, Vt); +#endif check_unitary_orthogonal_matrix(U, tol); check_unitary_orthogonal_matrix(Vt, tol); diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index cf1905d6d4..28271dfb0d 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -49,6 +49,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) ADD_COMPONENT_SUBDIRECTORY(sparse) ADD_COMPONENT_SUBDIRECTORY(blas) ADD_COMPONENT_SUBDIRECTORY(ode) + ADD_COMPONENT_SUBDIRECTORY(lapack) ADD_SUBDIRECTORY(performance) #ADD_SUBDIRECTORY(common) diff --git a/perf_test/lapack/CMakeLists.txt b/perf_test/lapack/CMakeLists.txt new file mode 100644 index 0000000000..478703d38a --- /dev/null +++ b/perf_test/lapack/CMakeLists.txt @@ -0,0 +1,8 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +if(KOKKOSKERNELS_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + lapack_svd SOURCES KokkosLapack_SVD_benchmark.cpp + ) +endif() diff --git a/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp b/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp new file mode 100644 index 0000000000..1ac9381ff8 --- /dev/null +++ b/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp @@ -0,0 +1,124 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosLapack_svd.hpp" + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include +#include "Benchmark_Context.hpp" + +struct svd_parameters { + int numRows, numCols; + bool verbose; + + svd_parameters(const int numRows_, const int numCols_, const bool verbose_) + : numRows(numRows_), numCols(numCols_), verbose(verbose_){}; +}; + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\t[Optional] --m :: number of rows of A" << std::endl; + std::cerr << "\t[Optional] --n :: number of columns of A" + << std::endl; +} // print_options + +int parse_inputs(svd_parameters& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.numRows)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.numCols)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +template +void run_svd_benchmark(benchmark::State& state, + const svd_parameters& svd_params) { + using mat_type = Kokkos::View; + using vec_type = Kokkos::View; + + const int m = svd_params.numRows; + const int n = svd_params.numCols; + + mat_type A("A", m, n), U("U", m, m), Vt("Vt", n, n); + vec_type S("S", Kokkos::min(m, n)); + + const uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + + // Initialize A with random numbers + double randStart = 0, randEnd = 0; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + + for (auto _ : state) { + (void)_; + KokkosLapack::svd("A", "A", A, S, U, Vt); + Kokkos::fence(); + } +} + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + svd_parameters svd_params(0, 0, false); + parse_inputs(svd_params, argc, argv); + + std::string bench_name = "KokkosLapack_SVD"; + + if (0 < common_params.repeat) { + benchmark::RegisterBenchmark( + bench_name.c_str(), run_svd_benchmark, + svd_params) + ->UseRealTime() + ->Iterations(common_params.repeat); + } else { + benchmark::RegisterBenchmark( + bench_name.c_str(), run_svd_benchmark, + svd_params) + ->UseRealTime(); + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + + return 0; +} From c1beeebc8c6f15911b4feb53011775ab9f086fc2 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Fri, 9 Feb 2024 14:37:07 -0700 Subject: [PATCH 158/326] Fix Cuda TPL finding (#2098) - Allow finding cusparse, cusolver based on manually provided paths - This is necessary when using an nvhpc toolchain instead of a standard cuda toolchain - Set header paths correctly (this is redundant in a cuda installation, in which $CUDA_ROOT/include is already a system include dir, but needed in other cases) --- cmake/Modules/FindTPLCUBLAS.cmake | 16 +++++--- cmake/Modules/FindTPLCUSOLVER.cmake | 59 +++++++++++++++++++++-------- cmake/Modules/FindTPLCUSPARSE.cmake | 59 +++++++++++++++++++++-------- 3 files changed, 98 insertions(+), 36 deletions(-) diff --git a/cmake/Modules/FindTPLCUBLAS.cmake b/cmake/Modules/FindTPLCUBLAS.cmake index feb39d0373..164f3bf4c4 100644 --- a/cmake/Modules/FindTPLCUBLAS.cmake +++ b/cmake/Modules/FindTPLCUBLAS.cmake @@ -1,5 +1,5 @@ if(CUBLAS_LIBRARIES AND CUBLAS_LIBRARY_DIRS AND CUBLAS_INCLUDE_DIRS) - kokkoskernels_find_imported(CUBLAS INTERFACE + kokkoskernels_find_imported(CUBLAS INTERFACE LIBRARIES ${CUBLAS_LIBRARIES} LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} HEADER_PATHS ${CUBLAS_INCLUDE_DIRS} @@ -8,19 +8,23 @@ elseif(CUBLAS_LIBRARIES AND CUBLAS_LIBRARY_DIRS) kokkoskernels_find_imported(CUBLAS INTERFACE LIBRARIES ${CUBLAS_LIBRARIES} LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + HEADER cublas.h ) elseif(CUBLAS_LIBRARIES) - kokkoskernels_find_imported(CUBLAS INTERFACE + kokkoskernels_find_imported(CUBLAS INTERFACE LIBRARIES ${CUBLAS_LIBRARIES} + HEADER cublas.h ) elseif(CUBLAS_LIBRARY_DIRS) - kokkoskernels_find_imported(CUBLAS INTERFACE - LIBRARIES cublas + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES cublas LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + HEADER cublas.h ) elseif(CUBLAS_ROOT OR KokkosKernels_CUBLAS_ROOT) # nothing specific provided, just ROOT - kokkoskernels_find_imported(CUBLAS INTERFACE - LIBRARIES cublas + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES cublas + HEADER cublas.h ) else() # backwards-compatible way FIND_PACKAGE(CUDA) diff --git a/cmake/Modules/FindTPLCUSOLVER.cmake b/cmake/Modules/FindTPLCUSOLVER.cmake index 4b75aefd65..3e43639495 100644 --- a/cmake/Modules/FindTPLCUSOLVER.cmake +++ b/cmake/Modules/FindTPLCUSOLVER.cmake @@ -1,17 +1,46 @@ -FIND_PACKAGE(CUDA) - -INCLUDE(FindPackageHandleStandardArgs) -IF (NOT CUDA_FOUND) - #Important note here: this find Module is named TPLCUSOLVER - #The eventual target is named CUSOLVER. To avoid naming conflicts - #the find module is called TPLCUSOLVER. This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_FOUND) -ELSE() - #The libraries might be empty - OR they might explicitly be not found - IF("${CUDA_cusolver_LIBRARY}" MATCHES "NOTFOUND") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_cusolver_LIBRARY) +if(CUSOLVER_LIBRARIES AND CUSOLVER_LIBRARY_DIRS AND CUSOLVER_INCLUDE_DIRS) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES ${CUSOLVER_LIBRARIES} + LIBRARY_PATHS ${CUSOLVER_LIBRARY_DIRS} + HEADER_PATHS ${CUSOLVER_INCLUDE_DIRS} + ) +elseif(CUSOLVER_LIBRARIES AND CUSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES ${CUSOLVER_LIBRARIES} + LIBRARY_PATHS ${CUSOLVER_LIBRARY_DIRS} + HEADER cusolverDn.h + ) +elseif(CUSOLVER_LIBRARIES) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES ${CUSOLVER_LIBRARIES} + HEADER cusolverDn.h + ) +elseif(CUSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES cusolver + LIBRARY_PATHS ${CUSOLVER_LIBRARY_DIRS} + HEADER cusolverDn.h + ) +elseif(CUSOLVER_ROOT OR KokkosKernels_CUSOLVER_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES cusolver + HEADER cusolverDn.h + ) +else() # backwards-compatible way + FIND_PACKAGE(CUDA) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT CUDA_FOUND) + #Important note here: this find Module is named TPLCUSOLVER + #The eventual target is named CUSOLVER. To avoid naming conflicts + #the find module is called TPLCUSOLVER. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_FOUND) ELSE() - KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSOLVER LIBRARY ${CUDA_cusolver_LIBRARY}) + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_cusolver_LIBRARY}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_cusolver_LIBRARY) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSOLVER INTERFACE LINK_LIBRARIES "${CUDA_cusolver_LIBRARY}") + ENDIF() ENDIF() -ENDIF() +endif() diff --git a/cmake/Modules/FindTPLCUSPARSE.cmake b/cmake/Modules/FindTPLCUSPARSE.cmake index f6e02129ae..6302f85d78 100644 --- a/cmake/Modules/FindTPLCUSPARSE.cmake +++ b/cmake/Modules/FindTPLCUSPARSE.cmake @@ -1,17 +1,46 @@ -FIND_PACKAGE(CUDA) - -INCLUDE(FindPackageHandleStandardArgs) -IF (NOT CUDA_FOUND) - #Important note here: this find Module is named TPLCUSPARSE - #The eventual target is named CUSPARSE. To avoid naming conflicts - #the find module is called TPLCUSPARSE. This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_FOUND) -ELSE() - #The libraries might be empty - OR they might explicitly be not found - IF("${CUDA_cusparse_LIBRARY}" MATCHES "NOTFOUND") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_cusparse_LIBRARY) +if(CUSPARSE_LIBRARIES AND CUSPARSE_LIBRARY_DIRS AND CUSPARSE_INCLUDE_DIRS) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES ${CUSPARSE_LIBRARIES} + LIBRARY_PATHS ${CUSPARSE_LIBRARY_DIRS} + HEADER_PATHS ${CUSPARSE_INCLUDE_DIRS} + ) +elseif(CUSPARSE_LIBRARIES AND CUSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES ${CUSPARSE_LIBRARIES} + LIBRARY_PATHS ${CUSPARSE_LIBRARY_DIRS} + HEADER cusparse.h + ) +elseif(CUSPARSE_LIBRARIES) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES ${CUSPARSE_LIBRARIES} + HEADER cusparse.h + ) +elseif(CUSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES cusparse + LIBRARY_PATHS ${CUSPARSE_LIBRARY_DIRS} + HEADER cusparse.h + ) +elseif(CUSPARSE_ROOT OR KokkosKernels_CUSPARSE_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES cusparse + HEADER cusparse.h + ) +else() # backwards-compatible way + FIND_PACKAGE(CUDA) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT CUDA_FOUND) + #Important note here: this find Module is named TPLCUSPARSE + #The eventual target is named CUSPARSE. To avoid naming conflicts + #the find module is called TPLCUSPARSE. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_FOUND) ELSE() - KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSPARSE LIBRARY ${CUDA_cusparse_LIBRARY}) + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_cusparse_LIBRARY}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_cusparse_LIBRARY) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSPARSE INTERFACE LINK_LIBRARIES "${CUDA_cusparse_LIBRARY}") + ENDIF() ENDIF() -ENDIF() +endif() From 2ca9cfc102b9987a763432a6c71666f324ae7fd0 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 14 Feb 2024 10:59:36 -0700 Subject: [PATCH 159/326] Add support for BSR matrices to some trsv routines (#2104) * Add support for BSR matrices to some trsv routines * Change trsv to gesv --- sparse/impl/KokkosSparse_trsv_impl.hpp | 1201 ++++++++++++----------- sparse/impl/KokkosSparse_trsv_spec.hpp | 71 +- sparse/src/KokkosSparse_trsv.hpp | 26 +- sparse/unit_test/Test_Sparse_gmres.hpp | 2 +- sparse/unit_test/Test_Sparse_trsv.hpp | 183 ++-- test_common/KokkosKernels_TestUtils.hpp | 11 +- 6 files changed, 815 insertions(+), 679 deletions(-) diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index fbbd547e34..58a6f6f7d8 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -14,15 +14,20 @@ // //@HEADER -#ifndef KOKKOSSPARSE_IMPL_TRSM_HPP_ -#define KOKKOSSPARSE_IMPL_TRSM_HPP_ +#ifndef KOKKOSSPARSE_TRSV_IMPL_HPP_ +#define KOKKOSSPARSE_TRSV_IMPL_HPP_ -/// \file KokkosSparse_impl_trsm.hpp -/// \brief Implementation(s) of sparse triangular solve. +/// \file KokkosSparse_trsv_impl.hpp +/// \brief Implementation(s) of sequential sparse triangular solve. #include #include -#include // temporarily +#include "KokkosBatched_Axpy.hpp" +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBlas2_gemv.hpp" +#include "KokkosBlas1_set.hpp" namespace KokkosSparse { namespace Impl { @@ -30,652 +35,682 @@ namespace Sequential { template -void lowerTriSolveCsrUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type r = 0; r < numRows; ++r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); +struct TrsvWrap { + using offset_type = + typename CrsMatrixType::row_map_type::non_const_value_type; + using lno_t = typename CrsMatrixType::index_type::non_const_value_type; + using scalar_t = typename CrsMatrixType::values_type::non_const_value_type; + using device_t = typename CrsMatrixType::device_type; + using sview_1d = typename Kokkos::View; + using STS = Kokkos::ArithTraits; + + struct CommonUnblocked { + CommonUnblocked(const lno_t block_size) { + KK_REQUIRE_MSG(block_size == 1, + "Tried to use block_size>1 for non-block-enabled Common"); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current row r - } // for each row r -} -template -void lowerTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type r = 0; r < numRows; ++r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); - } + scalar_t zero() { return STS::zero(); } - matrix_scalar_type A_rr = STS::zero(); - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - // FIXME (mfh 28 Aug 2014) This assumes that the diagonal entry - // has equal local row and column indices. That may not - // necessarily hold, depending on the row and column Maps. The - // way to fix this would be for Tpetra::CrsMatrix to remember - // the local column index of the diagonal entry (if there is - // one) in each row, and pass that along to this function. - if (r == c) { - A_rr += A_rc; - } else { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } - } // for each entry A_rc in the current row r - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = X(r, j) / A_rr; + template + scalar_t get(const ValuesView& vals, const offset_type i) { + return vals(i); } - } // for each row r -} -template -void upperTriSolveCsrUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - // If local_ordinal_type is unsigned and numRows is 0, the loop - // below will have entirely the wrong number of iterations. - if (numRows == 0) { - return; - } + void pluseq(scalar_t& lhs, const scalar_t& rhs) { lhs += rhs; } - // Don't use r >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do r == 0 (last - // iteration) below. - for (local_ordinal_type r = numRows - 1; r != 0; --r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); - } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current row r - } // for each row r - - // Last iteration: r = 0. - { - const local_ordinal_type r = 0; - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); + void gemv(RangeMultiVectorType X, const scalar_t& A, const lno_t r, + const lno_t c, const lno_t j, const char = 'N') { + X(r, j) -= A * X(c, j); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current row r - } // last iteration: r = 0 -} -template -void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - typedef Kokkos::ArithTraits STS; - - // If local_ordinal_type is unsigned and numRows is 0, the loop - // below will have entirely the wrong number of iterations. - if (numRows == 0) { - return; - } - - // Don't use r >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do r == 0 (last - // iteration) below. - for (local_ordinal_type r = numRows - 1; r != 0; --r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); + template + void divide(RangeMultiVectorType X, const scalar_t& A, const lno_t r, + const lno_t j) { + X(r, j) /= A; } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - matrix_scalar_type A_rr = STS::zero(); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - if (r == c) { - A_rr += A_rc; - } else { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } - } // for each entry A_rc in the current row r - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = X(r, j) / A_rr; + }; + + struct CommonBlocked { + // BSR data is in LayoutRight! + using Layout = Kokkos::LayoutRight; + + using UBlock = Kokkos::View< + scalar_t**, Layout, typename CrsMatrixType::device_type, + Kokkos::MemoryTraits >; + + using Block = + Kokkos::View >; + + using Vector = Kokkos::View >; + + using UVector = Kokkos::View< + scalar_t*, typename CrsMatrixType::device_type, + Kokkos::MemoryTraits >; + + lno_t m_block_size; + lno_t m_block_items; + Vector m_ones; + Block m_data; + Block m_tmp; // Needed for SerialGesv + UBlock m_utmp; // Needed for SerialGesv + Vector m_vec_data1; + Vector m_vec_data2; + + CommonBlocked(const lno_t block_size) + : m_block_size(block_size), + m_block_items(block_size * block_size), + m_ones("ones", block_size), + m_data("m_data", block_size, block_size), + m_tmp("m_tmp", block_size, block_size + 4), + m_utmp(m_tmp.data(), block_size, block_size + 4), + m_vec_data1("m_vec_data1", block_size), + m_vec_data2("m_vec_data2", block_size) { + Kokkos::deep_copy(m_ones, 1.0); } - } // for each row r - // Last iteration: r = 0. - { - const local_ordinal_type r = 0; - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); + UBlock zero() { + UBlock block(m_data.data(), m_block_size, m_block_size); + KokkosBlas::SerialSet::invoke(STS::zero(), block); + return block; } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - matrix_scalar_type A_rr = STS::zero(); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - if (r == c) - A_rr += A_rc; - else { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } - } // for each entry A_rc in the current row r - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = X(r, j) / A_rr; + + template + UBlock get(const ValuesView& vals, const offset_type i) { + scalar_t* data = const_cast(vals.data()); + UBlock rv(data + (i * m_block_items), m_block_size, m_block_size); + return rv; } - } // last iteration: r = 0 -} -template -void upperTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + void pluseq(UBlock& lhs, const UBlock& rhs) { + KokkosBatched::SerialAxpy::invoke(m_ones, rhs, lhs); } - } - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; - } + void gemv(RangeMultiVectorType X, const UBlock& A, const lno_t r, + const lno_t c, const lno_t j, const char transpose = 'N') { + // Create and populate x and y + UVector x(m_vec_data1.data(), m_block_size); + UVector y(m_vec_data2.data(), m_block_size); + for (lno_t b = 0; b < m_block_size; ++b) { + x(b) = X(c * m_block_size + b, j); + y(b) = X(r * m_block_size + b, j); + } + + KokkosBlas::Experimental::serial_gemv(transpose, -1, A, x, 1, y); - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. - for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type r = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + for (lno_t b = 0; b < m_block_size; ++b) { + X(r * m_block_size + b, j) = y(b); } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const local_ordinal_type c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type r = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + } + + template + void divide(RangeMultiVectorType X, const UBlock& A, const lno_t r, + const lno_t j) { + UVector x(m_vec_data1.data(), m_block_size); + for (lno_t b = 0; b < m_block_size; ++b) { + x(b) = X(r * m_block_size + b, j); } - } // for each entry A_rc in the current column c - } -} -template -void upperTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); - } - } + // If StaticPivoting is used, there are compiler errors related to + // comparing complex and non-complex. + using Algo = KokkosBatched::Gesv::NoPivoting; - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; + KokkosBatched::SerialGesv::invoke(A, x, x, m_utmp); + + for (lno_t b = 0; b < m_block_size; ++b) { + X(r * m_block_size + b, j) = x(b); + } + } + }; + + using CommonOps = std::conditional_t< + KokkosSparse::Experimental::is_bsr_matrix::value, + CommonBlocked, CommonUnblocked>; + + static void lowerTriSolveCsrUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + for (lno_t r = 0; r < numRows; ++r) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t c = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current row r + } // for each row r } - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. - for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = val(k); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { + static void lowerTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + CommonOps co(block_size); + + Kokkos::deep_copy(X, Y); + + for (lno_t r = 0; r < numRows; ++r) { + auto A_rr = co.zero(); + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + + for (offset_type k = beg; k < end; ++k) { + const auto A_rc = co.get(val, k); + const lno_t c = ind(k); + // FIXME (mfh 28 Aug 2014) This assumes that the diagonal entry + // has equal local row and column indices. That may not + // necessarily hold, depending on the row and column Maps. The + // way to fix this would be for Tpetra::CrsMatrix to remember + // the local column index of the diagonal entry (if there is + // one) in each row, and pass that along to this function. if (r == c) { - X(c, j) = X(c, j) / A_rc; + co.pluseq(A_rr, A_rc); } else { - X(r, j) -= A_rc * X(c, j); + for (lno_t j = 0; j < numVecs; ++j) { + co.gemv(X, A_rc, r, c, j); + } } + } // for each entry A_rc in the current row r + for (lno_t j = 0; j < numVecs; ++j) { + co.template divide(X, A_rr, r, j); } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const matrix_scalar_type A_rc = val(beg); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; - } + } // for each row r } -} -template -void lowerTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCsrUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + // If lno_t is unsigned and numRows is 0, the loop + // below will have entirely the wrong number of iterations. + if (numRows == 0) { + return; } + + // Don't use r >= 0 as the test, because that fails if + // lno_t is unsigned. We do r == 0 (last + // iteration) below. + for (lno_t r = numRows - 1; r != 0; --r) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t c = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current row r + } // for each row r + + // Last iteration: r = 0. + { + const lno_t r = 0; + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t c = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current row r + } // last iteration: r = 0 } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = val(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } // for each column c -} + static void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; -template -void upperTriSolveCscUnitDiagConj(RangeMultiVectorType X, - const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); - } - } + CommonOps co(block_size); - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; - } + Kokkos::deep_copy(X, Y); + + // If lno_t is unsigned and numRows is 0, the loop + // below will have entirely the wrong number of iterations. + if (numRows == 0) { + return; + } - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. - for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + // Don't use r >= 0 as the test, because that fails if + // lno_t is unsigned. We do r == 0 (last + // iteration) below. + for (lno_t r = numRows - 1; r != 0; --r) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + auto A_rr = co.zero(); + for (offset_type k = beg; k < end; ++k) { + const auto A_rc = co.get(val, k); + const lno_t c = ind(k); + if (r == c) { + co.pluseq(A_rr, A_rc); + } else { + for (lno_t j = 0; j < numVecs; ++j) { + co.gemv(X, A_rc, r, c, j); + } + } + } // for each entry A_rc in the current row r + for (lno_t j = 0; j < numVecs; ++j) { + co.template divide(X, A_rr, r, j); } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const local_ordinal_type c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + } // for each row r + + // Last iteration: r = 0. + { + const lno_t r = 0; + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + auto A_rr = co.zero(); + for (offset_type k = beg; k < end; ++k) { + const auto A_rc = co.get(val, k); + const lno_t c = ind(k); + if (r == c) { + co.pluseq(A_rr, A_rc); + } else { + for (lno_t j = 0; j < numVecs; ++j) { + co.gemv(X, A_rc, r, c, j); + } + } + } // for each entry A_rc in the current row r + for (lno_t j = 0; j < numVecs; ++j) { + co.template divide(X, A_rr, r, j); } - } // for each entry A_rc in the current column c + } // last iteration: r = 0 } -} -template -void upperTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCscUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + // If lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. + if (numCols == 0) { + return; } - } - - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; - } - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. - for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - if (r == c) { - X(c, j) = X(c, j) / A_rc; - } else { + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t r = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { X(r, j) -= A_rc * X(c, j); } - } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const matrix_scalar_type A_rc = STS::conj(val(beg)); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. + { + const lno_t c = 0; + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t r = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c } } -} -template -void lowerTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + Kokkos::deep_copy(X, Y); + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + // If lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. + if (numCols == 0) { + return; } - } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = val(k); + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = end - 1; k >= beg; --k) { + const lno_t r = ind(k); + const auto A_rc = val(k); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. + { + const offset_type beg = ptr(0); + const auto A_rc = val(beg); /*(vqd 20 Jul 2020) This assumes that the diagonal entry has equal local row and column indices. That may not necessarily hold, depending on the row and column Maps. See note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - if (r == c) { - X(c, j) = X(c, j) / A_rc; - } else { - X(r, j) -= A_rc * X(c, j); - } + for (lno_t j = 0; j < numVecs; ++j) { + X(0, j) = X(0, j) / A_rc; } - } // for each entry A_rc in the current column c - } // for each column c -} - -template -void lowerTriSolveCscUnitDiagConj(RangeMultiVectorType X, - const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); } } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } // for each column c -} + static void lowerTriSolveCscUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = val(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c + } // for each column c + } -template -void lowerTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCscUnitDiagConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + // If lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. + if (numCols == 0) { + return; + } + + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. + { + const lno_t c = 0; + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c } } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); + static void upperTriSolveCscConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + // If lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. + if (numCols == 0) { + return; + } + + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = end - 1; k >= beg; --k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. + { + const offset_type beg = ptr(0); + const scalar_t A_rc = STS::conj(val(beg)); /*(vqd 20 Jul 2020) This assumes that the diagonal entry has equal local row and column indices. That may not necessarily hold, depending on the row and column Maps. See note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - if (r == c) { - X(c, j) = X(c, j) / A_rc; - } else { + for (lno_t j = 0; j < numVecs; ++j) { + X(0, j) = X(0, j) / A_rc; + } + } + } + + static void lowerTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = val(k); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + } + + static void lowerTriSolveCscUnitDiagConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + for (lno_t j = 0; j < numVecs; ++j) { X(r, j) -= A_rc * X(c, j); } - } - } // for each entry A_rc in the current column c - } // for each column c -} + } // for each entry A_rc in the current column c + } // for each column c + } + + static void lowerTriSolveCscConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + Kokkos::deep_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + } +}; } // namespace Sequential } // namespace Impl } // namespace KokkosSparse -#endif // KOKKOSSPARSE_IMPL_TRSM_HPP +#endif // KOKKOSSPARSE_TRSV_IMPL_HPP_ diff --git a/sparse/impl/KokkosSparse_trsv_spec.hpp b/sparse/impl/KokkosSparse_trsv_spec.hpp index 2e838337d2..a74f4ffe64 100644 --- a/sparse/impl/KokkosSparse_trsv_spec.hpp +++ b/sparse/impl/KokkosSparse_trsv_spec.hpp @@ -20,6 +20,7 @@ #include #include #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_BsrMatrix.hpp" // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -55,6 +56,22 @@ struct trsv_eti_spec_avail { Kokkos::Device, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ + }; \ + \ + template <> \ + struct trsv_eti_spec_avail< \ + KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + const SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -93,50 +110,52 @@ struct TRSV; if (trans[0] == 'N' || trans[0] == 'n') { // no transpose if (uplo[0] == 'L' || uplo[0] == 'l') { // lower triangular if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::lowerTriSolveCsrUnitDiag(X, A, B); + Wrap::lowerTriSolveCsrUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::lowerTriSolveCsr(X, A, B); + Wrap::lowerTriSolveCsr(X, A, B); } } else { // upper triangular if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::upperTriSolveCsrUnitDiag(X, A, B); + Wrap::upperTriSolveCsrUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::upperTriSolveCsr(X, A, B); + Wrap::upperTriSolveCsr(X, A, B); } } } else if (trans[0] == 'T' || trans[0] == 't') { // transpose if (uplo[0] == 'L' || uplo[0] == 'l') { // lower triangular // Transposed lower tri CSR => upper tri CSC. if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::upperTriSolveCscUnitDiag(X, A, B); + Wrap::upperTriSolveCscUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::upperTriSolveCsc(X, A, B); + Wrap::upperTriSolveCsc(X, A, B); } } else { // upper triangular // Transposed upper tri CSR => lower tri CSC. if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::lowerTriSolveCscUnitDiag(X, A, B); + Wrap::lowerTriSolveCscUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::lowerTriSolveCsc(X, A, B); + Wrap::lowerTriSolveCsc(X, A, B); } } } else if (trans[0] == 'C' || trans[0] == 'c') { // conj transpose if (uplo[0] == 'L' || uplo[0] == 'l') { // lower triangular // Transposed lower tri CSR => upper tri CSC. if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::upperTriSolveCscUnitDiagConj(X, A, B); + Wrap::upperTriSolveCscUnitDiagConj(X, A, B); } else { // non unit diagonal - Sequential::upperTriSolveCscConj(X, A, B); + Wrap::upperTriSolveCscConj(X, A, B); } } else { // upper triangular // Transposed upper tri CSR => lower tri CSC. if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::lowerTriSolveCscUnitDiagConj(X, A, B); + Wrap::lowerTriSolveCscUnitDiagConj(X, A, B); } else { // non unit diagonal - Sequential::lowerTriSolveCscConj(X, A, B); + Wrap::lowerTriSolveCscConj(X, A, B); } } } @@ -169,6 +188,20 @@ struct TRSV, \ Kokkos::MemoryTraits >, \ + false, true>; \ + \ + extern template struct TRSV< \ + KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + const SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #define KOKKOSSPARSE_TRSV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ @@ -186,6 +219,20 @@ struct TRSV, \ Kokkos::MemoryTraits >, \ + false, true>; \ + \ + template struct TRSV< \ + KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + const SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/sparse/src/KokkosSparse_trsv.hpp b/sparse/src/KokkosSparse_trsv.hpp index 1363542f1b..9b25811d10 100644 --- a/sparse/src/KokkosSparse_trsv.hpp +++ b/sparse/src/KokkosSparse_trsv.hpp @@ -68,11 +68,15 @@ void trsv(const char uplo[], const char trans[], const char diag[], typename XMV::non_const_value_type>::value, "KokkosBlas::trsv: The output x must be nonconst."); + static_assert(KokkosSparse::is_crs_matrix::value || + KokkosSparse::Experimental::is_bsr_matrix::value, + "KokkosBlas::trsv: A is not a CRS or BSR matrix."); + // The following three code lines have been moved up by Massimiliano Lupo // Pasini typedef typename BMV::size_type size_type; - const size_type numRows = static_cast(A.numRows()); - const size_type numCols = static_cast(A.numCols()); + const size_type numRows = static_cast(A.numPointRows()); + const size_type numCols = static_cast(A.numPointCols()); const size_type zero = static_cast(0); if (zero != numRows && uplo[0] != 'U' && uplo[0] != 'u' && uplo[0] != 'L' && @@ -117,13 +121,21 @@ void trsv(const char uplo[], const char trans[], const char diag[], KokkosKernels::Impl::throw_runtime_exception(os.str()); } - typedef KokkosSparse::CrsMatrix< + using AMatrix_Bsr_Internal = KokkosSparse::Experimental::BsrMatrix< typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - AMatrix_Internal A_i = A; + typename AMatrix::const_size_type>; + + using AMatrix_Internal = std::conditional_t< + KokkosSparse::is_crs_matrix::value, + KokkosSparse::CrsMatrix, + typename AMatrix::const_size_type>, + AMatrix_Bsr_Internal>; + + AMatrix_Internal A_i(A); typedef Kokkos::View< typename BMV::const_value_type**, typename BMV::array_layout, diff --git a/sparse/unit_test/Test_Sparse_gmres.hpp b/sparse/unit_test/Test_Sparse_gmres.hpp index 7b624c7f75..ee78d27729 100644 --- a/sparse/unit_test/Test_Sparse_gmres.hpp +++ b/sparse/unit_test/Test_Sparse_gmres.hpp @@ -103,7 +103,7 @@ struct GmresTest { constexpr auto m = 15; constexpr auto tol = TolMeta::value; constexpr auto diagDominance = 1; - constexpr bool verbose = true; + constexpr bool verbose = false; constexpr auto block_size = UseBlocks ? 10 : 1; auto A = get_A(n, diagDominance, block_size); diff --git a/sparse/unit_test/Test_Sparse_trsv.hpp b/sparse/unit_test/Test_Sparse_trsv.hpp index d580cc472d..8fb4763d71 100644 --- a/sparse/unit_test/Test_Sparse_trsv.hpp +++ b/sparse/unit_test/Test_Sparse_trsv.hpp @@ -34,89 +34,131 @@ typedef Kokkos::complex kokkos_complex_double; typedef Kokkos::complex kokkos_complex_float; namespace Test { -// TODO: remove this once MD develop branch is merge. -// The below functionolity exists in SparseUtils. - -template -void check_trsv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type b, - y_vector_type expected_x, int numMV, const char uplo[], - const char trans[]) { - // typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename scalar_view_t::value_type ScalarA; - double eps = (std::is_same::value - ? 2 * 1e-2 - : (std::is_same>::value || - std::is_same>::value) - ? 2 * 1e-1 - : 1e-7); - - Kokkos::fence(); - KokkosSparse::trsv(uplo, trans, "N", input_mat, b, x); - - for (int i = 0; i < numMV; ++i) { - auto x_i = Kokkos::subview(x, Kokkos::ALL(), i); - - auto expected_x_i = Kokkos::subview(expected_x, Kokkos::ALL(), i); - - EXPECT_NEAR_KK_1DVIEW(expected_x_i, x_i, eps); - } + +template < + typename Crs, typename LUType, typename size_type, + typename std::enable_if::value>::type* = nullptr> +LUType get_LU(char l_or_u, int n, size_type& nnz, int row_size_variance, + int bandwidth, int) { + auto LU = KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( + l_or_u, n, n, nnz, row_size_variance, bandwidth); + + return LU; +} + +template < + typename Crs, typename LUType, typename size_type, + typename std::enable_if::value>::type* = nullptr> +LUType get_LU(char l_or_u, int n, size_type& nnz, int row_size_variance, + int bandwidth, int block_size) { + auto LU_unblocked = + KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( + l_or_u, n, n, nnz, row_size_variance, bandwidth); + + // Convert to BSR + LUType LU(LU_unblocked, block_size); + + return LU; } -} // namespace Test template -void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, int numMV) { - lno_t numCols = numRows; + typename layout, typename device> +struct TrsvTest { + using View2D = Kokkos::View; + using execution_space = typename device::execution_space; + + using Crs = CrsMatrix; + using Bsr = BsrMatrix; + + // TODO: remove this once MD develop branch is merge. + // The below functionolity exists in SparseUtils. + template + static void check_trsv_mv(sp_matrix_type input_mat, View2D x, View2D b, + View2D expected_x, int numMV, const char uplo[], + const char trans[]) { + double eps = (std::is_same::value + ? 2 * 1e-2 + : (std::is_same>::value || + std::is_same>::value) + ? 2 * 1e-1 + : 1e-7); + + Kokkos::fence(); + KokkosSparse::trsv(uplo, trans, "N", input_mat, b, x); + + for (int i = 0; i < numMV; ++i) { + auto x_i = Kokkos::subview(x, Kokkos::ALL(), i); + + auto expected_x_i = Kokkos::subview(expected_x, Kokkos::ALL(), i); + + EXPECT_NEAR_KK_1DVIEW(expected_x_i, x_i, eps); + } + } + + template + static void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, int numMV) { + using sp_matrix_type = std::conditional_t; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - // typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + constexpr auto block_size = UseBlocks ? 10 : 1; - typedef Kokkos::View ViewTypeX; - typedef Kokkos::View ViewTypeY; + lno_t numCols = numRows; - ViewTypeX b_x("A", numRows, numMV); - ViewTypeY b_y("B", numCols, numMV); - ViewTypeX b_x_copy("B", numCols, numMV); + View2D b_x("A", numRows, numMV); + View2D b_y("B", numCols, numMV); + View2D b_x_copy("B", numCols, numMV); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); - Kokkos::fill_random(b_x_copy, rand_pool, scalar_t(10)); + Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::fill_random(b_x_copy, rand_pool, scalar_t(10)); - typename ViewTypeY::non_const_value_type alpha = 1; - typename ViewTypeY::non_const_value_type beta = 0; + scalar_t alpha = 1; + scalar_t beta = 0; - // this function creates a dense lower and upper triangular matrix. - // TODO: SHOULD CHANGE IT TO SPARSE - crsMat_t lower_part = - KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( - 'L', numRows, numCols, nnz, row_size_variance, bandwidth); + // this function creates a dense lower and upper triangular matrix. + auto lower_part = get_LU( + 'L', numRows, nnz, row_size_variance, bandwidth, block_size); - Test::shuffleMatrixEntries(lower_part.graph.row_map, lower_part.graph.entries, - lower_part.values); + Test::shuffleMatrixEntries(lower_part.graph.row_map, + lower_part.graph.entries, lower_part.values, + block_size); - KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N"); + KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y); + check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N"); - KokkosSparse::spmv("T", alpha, lower_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "T"); - // typedef typename Kokkos::View indexview; + if (!UseBlocks) { + KokkosSparse::spmv("T", alpha, lower_part, b_x_copy, beta, b_y); + check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "T"); + } - crsMat_t upper_part = - KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( - 'U', numRows, numCols, nnz, row_size_variance, bandwidth); + auto upper_part = get_LU( + 'U', numRows, nnz, row_size_variance, bandwidth, block_size); - Test::shuffleMatrixEntries(upper_part.graph.row_map, upper_part.graph.entries, - upper_part.values); + Test::shuffleMatrixEntries(upper_part.graph.row_map, + upper_part.graph.entries, upper_part.values, + block_size); - KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N"); + KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y); + check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N"); + + if (!UseBlocks) { + KokkosSparse::spmv("T", alpha, upper_part, b_x_copy, beta, b_y); + check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "T"); + } + } +}; - KokkosSparse::spmv("T", alpha, upper_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "T"); +} // namespace Test + +template +void test_trsv_mv() { + using TestStruct = Test::TrsvTest; + TestStruct::template test_trsv_mv(1000, 1000 * 30, 200, 10, 1); + TestStruct::template test_trsv_mv(800, 800 * 30, 100, 10, 5); + TestStruct::template test_trsv_mv(400, 400 * 20, 100, 5, 10); + TestStruct::template test_trsv_mv(1000, 1000 * 30, 200, 10, 1); + TestStruct::template test_trsv_mv(800, 800 * 30, 100, 10, 5); + TestStruct::template test_trsv_mv(400, 400 * 20, 100, 5, 10); } // Note BMK 7-22: the matrix generator used by this test always @@ -126,12 +168,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, TEST_F( \ TestCategory, \ sparse##_##trsv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ - test_trsv_mv( \ - 1000, 1000 * 30, 200, 10, 1); \ - test_trsv_mv( \ - 800, 800 * 30, 100, 10, 5); \ - test_trsv_mv( \ - 400, 400 * 20, 100, 5, 10); \ + test_trsv_mv(); \ } #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 236bcdd1c8..232b66242a 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -776,9 +776,11 @@ class RandCsMatrix { MapViewTypeD get_map() { return __getter_copy_helper(__map_d); } }; -/// \brief Randomly shuffle the entries in each row (col) of a Crs (Ccs) matrix. +/// \brief Randomly shuffle the entries in each row (col) of a Crs (Ccs) or Bsr +/// matrix. template -void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { +void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values, + const size_t block_size = 1) { using size_type = typename Rowptrs::non_const_value_type; using ordinal_type = typename Entries::value_type; auto rowptrsHost = @@ -789,6 +791,7 @@ void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), values); ordinal_type numRows = rowptrsHost.extent(0) ? (rowptrsHost.extent(0) - 1) : 0; + const size_t block_items = block_size * block_size; for (ordinal_type i = 0; i < numRows; i++) { size_type rowBegin = rowptrsHost(i); size_type rowEnd = rowptrsHost(i + 1); @@ -796,7 +799,9 @@ void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { ordinal_type swapRange = rowEnd - j; size_type swapOffset = j + (rand() % swapRange); std::swap(entriesHost(j), entriesHost(swapOffset)); - std::swap(valuesHost(j), valuesHost(swapOffset)); + std::swap_ranges(valuesHost.data() + j * block_items, + valuesHost.data() + (j + 1) * block_items, + valuesHost.data() + swapOffset * block_items); } } Kokkos::deep_copy(entries, entriesHost); From 0bf3dcf1e086bc8645e95cb5a85e6ece8fefc003 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 14 Feb 2024 12:54:44 -0700 Subject: [PATCH 160/326] Lapack - SVD: adding quick return when cuSOLVER is skipped (#2107) Currently we still run the tests on U, S and Vt which does not make sense since we actively skip this test because cuSOLVER does not support more columns than rows... --- lapack/unit_test/Test_Lapack_svd.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/unit_test/Test_Lapack_svd.hpp b/lapack/unit_test/Test_Lapack_svd.hpp index 032b9f86c6..3b2cd3d8d5 100644 --- a/lapack/unit_test/Test_Lapack_svd.hpp +++ b/lapack/unit_test/Test_Lapack_svd.hpp @@ -474,9 +474,6 @@ int impl_test_svd(const int m, const int n) { using vector_type = Kokkos::View; - std::cout << "Running impl_test_svd with sizes: " << m << "x" << n - << std::endl; - const mag_type max_val = 10; const mag_type tol = 1000 * max_val * KAT_S::eps(); @@ -499,6 +496,8 @@ int impl_test_svd(const int m, const int n) { Kokkos::Cuda>) { if (m >= n) { KokkosLapack::svd("A", "A", A, S, U, Vt); + } else { + return 0; } } else { KokkosLapack::svd("A", "A", A, S, U, Vt); From 5f719e5bd86bf7a8597878a496cb89a80c7c420c Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 15 Feb 2024 13:46:46 -0700 Subject: [PATCH 161/326] Fix build error in trsv on gcc8 --- sparse/impl/KokkosSparse_trsv_impl.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index 58a6f6f7d8..41f7083398 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -149,15 +149,16 @@ struct TrsvWrap { void divide(RangeMultiVectorType X, const UBlock& A, const lno_t r, const lno_t j) { UVector x(m_vec_data1.data(), m_block_size); + UVector y(m_vec_data2.data(), m_block_size); for (lno_t b = 0; b < m_block_size; ++b) { - x(b) = X(r * m_block_size + b, j); + y(b) = X(r * m_block_size + b, j); } - // If StaticPivoting is used, there are compiler errors related to + // if StaticPivoting is used, there are compiler errors related to // comparing complex and non-complex. using Algo = KokkosBatched::Gesv::NoPivoting; - KokkosBatched::SerialGesv::invoke(A, x, x, m_utmp); + KokkosBatched::SerialGesv::invoke(A, x, y, m_utmp); for (lno_t b = 0; b < m_block_size; ++b) { X(r * m_block_size + b, j) = x(b); From 0425cc6a0eb3637758a6eaf3de5385e46357d932 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Thu, 15 Feb 2024 16:33:47 -0600 Subject: [PATCH 162/326] Add a workaround for compilation errors with cuda-12.2.0 + gcc-12.3 (#2108) On Perlmutter@NERSC, I met this error /usr/lib64/gcc/x86_64-suse-linux/12/include/avx512fp16intrin.h(38): error: vector_size attribute requires an arithmetic or enum type typedef __half __v8hf __attribute__ ((__vector_size__ (16))); The workaround was mentioned at https://forums.developer.nvidia.com/t/including-cub-header-breakes-compilation-with-gcc-12-and-sse2-or-better/255018 --- batched/dense/src/KokkosBatched_Vector_SIMD.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index e27419e7c2..753904dbb9 100644 --- a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -513,6 +513,11 @@ class Vector, 4> { #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX__) || defined(__AVX2__) + +#if CUDA_VERSION < 12022 +#undef _Float16 +#endif + #include namespace KokkosBatched { @@ -668,6 +673,9 @@ class Vector >, 2> { #endif /* #if defined(__AVX__) || defined(__AVX2__) */ #if defined(__AVX512F__) +#if CUDA_VERSION < 12022 +#undef _Float16 +#endif #include namespace KokkosBatched { From 264dee2f729b0a126907b2a27076ad855ff4b078 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 15 Feb 2024 16:21:42 -0700 Subject: [PATCH 163/326] Lapack - SVD: fix for unit-test when MKL is enabled (#2110) This is really a problem with our implementation of the BLAS interface when MKL is enabled since MKL redefines the function signatures of blas functions using MKL_INT instead if int... --- blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 18 ++--- blas/tpls/KokkosBlas_Host_tpl.cpp | 73 +++++++++++--------- blas/tpls/KokkosBlas_Host_tpl.hpp | 17 +++-- lapack/unit_test/Test_Lapack_svd.hpp | 4 ++ 4 files changed, 65 insertions(+), 47 deletions(-) diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 66177e28a6..68bf2708ec 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -60,20 +60,20 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS," #SCALAR_TYPE \ "]"); \ const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = C.extent(0); \ - const int N = C.extent(1); \ - const int K = A.extent(A_t ? 0 : 1); \ + const KK_INT M = C.extent(0); \ + const KK_INT N = C.extent(1); \ + const KK_INT K = A.extent(A_t ? 0 : 1); \ \ bool A_is_lr = std::is_same::value; \ bool B_is_lr = std::is_same::value; \ bool C_is_lr = std::is_same::value; \ \ - const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ + const KK_INT AST = A_is_lr ? A.stride(0) : A.stride(1), \ + LDA = AST == 0 ? 1 : AST; \ + const KK_INT BST = B_is_lr ? B.stride(0) : B.stride(1), \ + LDB = BST == 0 ? 1 : BST; \ + const KK_INT CST = C_is_lr ? C.stride(0) : C.stride(1), \ + LDC = CST == 0 ? 1 : CST; \ \ const BASE_SCALAR_TYPE alpha_val = alpha, beta_val = beta; \ if (!A_is_lr && !B_is_lr && !C_is_lr) \ diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index ec739aa98a..68f2810907 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -22,6 +22,8 @@ #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) +using KokkosBlas::Impl::KK_INT; + /// Fortran headers extern "C" { @@ -339,26 +341,27 @@ void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, int*, /// Gemm /// -void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, int*, int*, int*, - const float*, const float*, int*, - const float*, int*, const float*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, int*, int*, int*, - const double*, const double*, int*, - const double*, int*, const double*, - /* */ double*, int*); -void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, int*, int*, int*, - const std::complex*, - const std::complex*, int*, - const std::complex*, int*, +void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const float*, const float*, KK_INT*, + const float*, KK_INT*, const float*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const double*, const double*, + KK_INT*, const double*, KK_INT*, + const double*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, int*, int*, int*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - const std::complex*, - /* */ std::complex*, int*); + /* */ std::complex*, KK_INT*); /// /// Herk @@ -632,10 +635,11 @@ void HostBlas::trsv(const char uplo, const char transa, const char diag, F77_FUNC_STRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, int m, int n, - int k, const float alpha, const float* a, int lda, - const float* b, int ldb, const float beta, - /* */ float* c, int ldc) { +void HostBlas::gemm(const char transa, const char transb, KK_INT m, + KK_INT n, KK_INT k, const float alpha, + const float* a, KK_INT lda, const float* b, + KK_INT ldb, const float beta, + /* */ float* c, KK_INT ldc) { F77_FUNC_SGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } @@ -750,10 +754,11 @@ void HostBlas::trsv(const char uplo, const char transa, const char diag, F77_FUNC_DTRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, int m, int n, - int k, const double alpha, const double* a, int lda, - const double* b, int ldb, const double beta, - /* */ double* c, int ldc) { +void HostBlas::gemm(const char transa, const char transb, KK_INT m, + KK_INT n, KK_INT k, const double alpha, + const double* a, KK_INT lda, const double* b, + KK_INT ldb, const double beta, + /* */ double* c, KK_INT ldc) { F77_FUNC_DGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } @@ -906,10 +911,10 @@ void HostBlas >::trsv(const char uplo, const char transa, } template <> void HostBlas >::gemm( - const char transa, const char transb, int m, int n, int k, - const std::complex alpha, const std::complex* a, int lda, - const std::complex* b, int ldb, const std::complex beta, - /* */ std::complex* c, int ldc) { + const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_CGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, @@ -1081,10 +1086,10 @@ void HostBlas >::trsv(const char uplo, const char transa, template <> void HostBlas >::gemm( - const char transa, const char transb, int m, int n, int k, - const std::complex alpha, const std::complex* a, int lda, - const std::complex* b, int ldb, const std::complex beta, - /* */ std::complex* c, int ldc) { + const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_ZGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 29afff4d62..8e8781bfcf 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -25,10 +25,19 @@ #include "Kokkos_ArithTraits.hpp" #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#include "mkl_types.h" +#endif namespace KokkosBlas { namespace Impl { +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +using KK_INT = MKL_INT; +#else +using KK_INT = int; +#endif + template struct HostBlas { typedef Kokkos::ArithTraits ats; @@ -97,10 +106,10 @@ struct HostBlas { const T *a, int lda, /* */ T *b, int ldb); - static void gemm(const char transa, const char transb, int m, int n, int k, - const T alpha, const T *a, int lda, const T *b, int ldb, - const T beta, - /* */ T *c, int ldc); + static void gemm(const char transa, const char transb, KK_INT m, KK_INT n, + KK_INT k, const T alpha, const T *a, KK_INT lda, const T *b, + KK_INT ldb, const T beta, + /* */ T *c, KK_INT ldc); static void herk(const char transa, const char transb, int n, int k, const T alpha, const T *a, int lda, const T beta, diff --git a/lapack/unit_test/Test_Lapack_svd.hpp b/lapack/unit_test/Test_Lapack_svd.hpp index 3b2cd3d8d5..6cf161fd3b 100644 --- a/lapack/unit_test/Test_Lapack_svd.hpp +++ b/lapack/unit_test/Test_Lapack_svd.hpp @@ -529,8 +529,10 @@ int test_svd() { Kokkos::View; ret = Test::impl_analytic_2x2_svd(); + EXPECT_EQ(ret, 0); ret = Test::impl_analytic_2x3_svd(); + EXPECT_EQ(ret, 0); ret = Test::impl_test_svd(0, 0); EXPECT_EQ(ret, 0); @@ -558,8 +560,10 @@ int test_svd() { Kokkos::View; ret = Test::impl_analytic_2x2_svd(); + EXPECT_EQ(ret, 0); ret = Test::impl_analytic_2x3_svd(); + EXPECT_EQ(ret, 0); ret = Test::impl_test_svd(0, 0); EXPECT_EQ(ret, 0); From 23abac417aaa6cb6c485e7cf3ae60ee81e5d9ea5 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 15 Feb 2024 16:25:10 -0700 Subject: [PATCH 164/326] Revert "Merge pull request #2037 from ndellingwood/remove-rocsolver-optional-dependency" (#2106) This reverts commit 5a36d577e725546062af3b297eec87e23a40ab58, reversing changes made to 2c66d291f9b5512e17f9375304902b6ba42133b2. --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 9ac02b06f6..a52f0c098c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES Kokkos - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE CUSOLVER ROCBLAS ROCSPARSE + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE CUSOLVER ROCBLAS ROCSPARSE ROCSOLVER TEST_OPTIONAL_TPLS yamlcpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in From e3026de618b8c8887e88202821779d93da2bf179 Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Fri, 16 Feb 2024 14:57:55 -0600 Subject: [PATCH 165/326] Fixing missing inclusion in source file --- example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp index 6a67c1aec4..eacf134f89 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp @@ -15,6 +15,7 @@ //@HEADER #include +#include #include "Kokkos_Core.hpp" From fdadc74c0946c9e6258a2f10a21512c064bc1541 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 19 Feb 2024 09:21:24 -0700 Subject: [PATCH 166/326] BLAS - MKL: fixing HostBlas calls to handle MKL_INT type (#2112) MKL redefines the BLAS interface based on how MKL_INT is defined we need to wrap that definition with our own Kokkos Kernels INT type to make both compatible with regular BLAS. applying clang-format --- blas/tpls/KokkosBlas_Host_tpl.cpp | 822 +++++++++++++++--------------- blas/tpls/KokkosBlas_Host_tpl.hpp | 93 ++-- 2 files changed, 464 insertions(+), 451 deletions(-) diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 68f2810907..50aab57c73 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -30,66 +30,68 @@ extern "C" { /// /// scal /// -void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, - /* */ float* x, const int* x_inc); -void F77_BLAS_MANGLE(dscal, DSCAL)(const int* N, const double* alpha, - /* */ double* x, const int* x_inc); +void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, + /* */ float* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(dscal, DSCAL)(const KK_INT* N, const double* alpha, + /* */ double* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(cscal, - CSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + CSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(zscal, - ZSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); /// /// max /// -int F77_BLAS_MANGLE(isamax, ISAMAX)(const int* N, const float* x, - const int* x_inc); -int F77_BLAS_MANGLE(idamax, IDAMAX)(const int* N, const double* x, - const int* x_inc); -int F77_BLAS_MANGLE(icamax, ICAMAX)(const int* N, const std::complex* x, - const int* x_inc); -int F77_BLAS_MANGLE(izamax, IZAMAX)(const int* N, const std::complex* x, - const int* x_inc); +KK_INT F77_BLAS_MANGLE(isamax, ISAMAX)(const KK_INT* N, const float* x, + const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(idamax, IDAMAX)(const KK_INT* N, const double* x, + const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(icamax, ICAMAX)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(izamax, IZAMAX)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc); /// /// nrm2 /// -float F77_BLAS_MANGLE(snrm2, SNRM2)(const int* N, const float* x, - const int* x_inc); -double F77_BLAS_MANGLE(dnrm2, DNRM2)(const int* N, const double* x, - const int* x_inc); -float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const int* N, +float F77_BLAS_MANGLE(snrm2, SNRM2)(const KK_INT* N, const float* x, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dnrm2, DNRM2)(const KK_INT* N, const double* x, + const KK_INT* x_inc); +float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const KK_INT* N, const std::complex* x, - const int* x_inc); -double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const int* N, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const KK_INT* N, const std::complex* x, - const int* x_inc); + const KK_INT* x_inc); /// /// sum /// -float F77_BLAS_MANGLE(sasum, SASUM)(const int* N, const float* x, - const int* x_inc); -double F77_BLAS_MANGLE(dasum, DASUM)(const int* N, const double* x, - const int* x_inc); -float F77_BLAS_MANGLE(scasum, SCASUM)(const int* N, +float F77_BLAS_MANGLE(sasum, SASUM)(const KK_INT* N, const float* x, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dasum, DASUM)(const KK_INT* N, const double* x, + const KK_INT* x_inc); +float F77_BLAS_MANGLE(scasum, SCASUM)(const KK_INT* N, const std::complex* x, - const int* x_inc); -double F77_BLAS_MANGLE(dzasum, DZASUM)(const int* N, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dzasum, DZASUM)(const KK_INT* N, const std::complex* x, - const int* x_inc); + const KK_INT* x_inc); /// /// dot /// -float F77_BLAS_MANGLE(sdot, SDOT)(const int* N, const float* x, - const int* x_inc, const float* y, - const int* y_inc); -double F77_BLAS_MANGLE(ddot, DDOT)(const int* N, const double* x, - const int* x_inc, const double* y, - const int* y_inc); +float F77_BLAS_MANGLE(sdot, SDOT)(const KK_INT* N, const float* x, + const KK_INT* x_inc, const float* y, + const KK_INT* y_inc); +double F77_BLAS_MANGLE(ddot, DDOT)(const KK_INT* N, const double* x, + const KK_INT* x_inc, const double* y, + const KK_INT* y_inc); #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) // clang-format off // For the return type, don't use std::complex, otherwise compiler will complain @@ -104,77 +106,78 @@ typedef struct { double vals[2]; } _kk_double2; -_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, +_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const KK_INT* N, const std::complex* x, - const int* x_inc, + const KK_INT* x_inc, const std::complex* y, - const int* y_inc); -_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const int* N, + const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const KK_INT* N, const std::complex* x, - const int* x_inc, + const KK_INT* x_inc, const std::complex* y, - const int* y_inc); -_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, + const KK_INT* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const KK_INT* N, const std::complex* x, - const int* x_inc, + const KK_INT* x_inc, const std::complex* y, - const int* y_inc); -_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const int* N, + const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const KK_INT* N, const std::complex* x, - const int* x_inc, + const KK_INT* x_inc, const std::complex* y, - const int* y_inc); + const KK_INT* y_inc); #else void F77_BLAS_MANGLE(cdotu, - CDOTU)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + CDOTU)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(zdotu, - ZDOTU)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + ZDOTU)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(cdotc, - CDOTC)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + CDOTC)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(zdotc, - ZDOTC)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + ZDOTC)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); #endif /// /// axpy /// -void F77_BLAS_MANGLE(saxpy, SAXPY)(const int* N, const float* alpha, - const float* x, const int* x_inc, - /* */ float* y, const int* y_inc); -void F77_BLAS_MANGLE(daxpy, DAXPY)(const int* N, const double* alpha, - const double* x, const int* x_inc, - /* */ double* y, const int* y_inc); +void F77_BLAS_MANGLE(saxpy, SAXPY)(const KK_INT* N, const float* alpha, + const float* x, const KK_INT* x_inc, + /* */ float* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(daxpy, DAXPY)(const KK_INT* N, const double* alpha, + const double* x, const KK_INT* x_inc, + /* */ double* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(caxpy, - CAXPY)(const int* N, const std::complex* alpha, - const std::complex* x, const int* x_inc, - /* */ std::complex* y, const int* y_inc); + CAXPY)(const KK_INT* N, const std::complex* alpha, + const std::complex* x, const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(zaxpy, - ZAXPY)(const int* N, const std::complex* alpha, - const std::complex* x, const int* x_inc, - /* */ std::complex* y, const int* y_inc); + ZAXPY)(const KK_INT* N, const std::complex* alpha, + const std::complex* x, const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); /// /// rot /// -void F77_BLAS_MANGLE(srot, SROT)(int const* N, float* X, int const* incx, - float* Y, int const* incy, float* c, float* s); -void F77_BLAS_MANGLE(drot, DROT)(int const* N, double* X, int const* incx, - double* Y, int const* incy, double* c, +void F77_BLAS_MANGLE(srot, SROT)(KK_INT const* N, float* X, KK_INT const* incx, + float* Y, KK_INT const* incy, float* c, + float* s); +void F77_BLAS_MANGLE(drot, DROT)(KK_INT const* N, double* X, KK_INT const* incx, + double* Y, KK_INT const* incy, double* c, double* s); -void F77_BLAS_MANGLE(crot, CROT)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy, float* c, float* s); -void F77_BLAS_MANGLE(zrot, ZROT)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy, double* c, double* s); +void F77_BLAS_MANGLE(crot, CROT)(KK_INT const* N, std::complex* X, + KK_INT const* incx, std::complex* Y, + KK_INT const* incy, float* c, float* s); +void F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, + KK_INT const* incx, std::complex* Y, + KK_INT const* incy, double* c, double* s); /// /// rotg @@ -191,12 +194,12 @@ void F77_BLAS_MANGLE(zrotg, ZROTG)(std::complex* a, /// /// rotm /// -void F77_BLAS_MANGLE(srotm, SROTM)(const int* n, float* X, const int* incx, - float* Y, const int* incy, - float const* param); -void F77_BLAS_MANGLE(drotm, DROTM)(const int* n, double* X, const int* incx, - double* Y, const int* incy, - double const* param); +void F77_BLAS_MANGLE(srotm, SROTM)(const KK_INT* n, float* X, + const KK_INT* incx, float* Y, + const KK_INT* incy, float const* param); +void F77_BLAS_MANGLE(drotm, DROTM)(const KK_INT* n, double* X, + const KK_INT* incx, double* Y, + const KK_INT* incy, double const* param); /// /// rotmg @@ -209,72 +212,78 @@ void F77_BLAS_MANGLE(drotmg, DROTMG)(double* d1, double* d2, double* x1, /// /// swap /// -void F77_BLAS_MANGLE(sswap, SSWAP)(int const* N, float* X, int const* incx, - float* Y, int const* incy); -void F77_BLAS_MANGLE(dswap, DSWAP)(int const* N, double* X, int const* incx, - double* Y, int const* incy); -void F77_BLAS_MANGLE(cswap, CSWAP)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy); -void F77_BLAS_MANGLE(zswap, ZSWAP)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy); +void F77_BLAS_MANGLE(sswap, SSWAP)(KK_INT const* N, float* X, + KK_INT const* incx, float* Y, + KK_INT const* incy); +void F77_BLAS_MANGLE(dswap, DSWAP)(KK_INT const* N, double* X, + KK_INT const* incx, double* Y, + KK_INT const* incy); +void F77_BLAS_MANGLE(cswap, CSWAP)(KK_INT const* N, std::complex* X, + KK_INT const* incx, std::complex* Y, + KK_INT const* incy); +void F77_BLAS_MANGLE(zswap, ZSWAP)(KK_INT const* N, std::complex* X, + KK_INT const* incx, std::complex* Y, + KK_INT const* incy); /// /// Gemv /// -void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, int*, int*, const float*, - const float*, int*, const float*, int*, +void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, KK_INT*, KK_INT*, const float*, + const float*, KK_INT*, const float*, KK_INT*, const float*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, int*, int*, const double*, - const double*, int*, const double*, int*, - const double*, - /* */ double*, int*); -void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, int*, int*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, KK_INT*, KK_INT*, const double*, + const double*, KK_INT*, const double*, + KK_INT*, const double*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, int*, int*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); + /* */ std::complex*, KK_INT*); /// /// Ger /// -void F77_BLAS_MANGLE(sger, SGER)(int*, int*, const float*, const float*, int*, - const float*, int*, float*, int*); -void F77_BLAS_MANGLE(dger, DGER)(int*, int*, const double*, const double*, int*, - const double*, int*, double*, int*); -void F77_BLAS_MANGLE(cgeru, CGERU)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(zgeru, ZGERU)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); +void F77_BLAS_MANGLE(sger, SGER)(KK_INT*, KK_INT*, const float*, const float*, + KK_INT*, const float*, KK_INT*, float*, + KK_INT*); +void F77_BLAS_MANGLE(dger, DGER)(KK_INT*, KK_INT*, const double*, const double*, + KK_INT*, const double*, KK_INT*, double*, + KK_INT*); +void F77_BLAS_MANGLE(cgeru, CGERU)(KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgeru, ZGERU)(KK_INT*, KK_INT*, + const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(cgerc, CGERC)(KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgerc, ZGERC)(KK_INT*, KK_INT*, + const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); /// /// Syr /// -void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, int*, const float*, const float*, - int*, float*, int*); -void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, int*, const double*, - const double*, int*, double*, int*); +void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, KK_INT*, const float*, + const float*, KK_INT*, float*, KK_INT*); +void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, + const double*, KK_INT*, double*, KK_INT*); // Although there is a cgeru, there is no csyru // Although there is a zgeru, there is no zsyru // Although there is a cgerc, there is no csyrc, but there is cher (see below) @@ -284,22 +293,22 @@ void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, int*, const double*, /// Her /// -void F77_BLAS_MANGLE(cher, CHER)(const char*, int*, const float*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(zher, ZHER)(const char*, int*, const double*, - const std::complex*, int*, - std::complex*, int*); +void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); /// /// Syr2 /// -void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, int*, const float*, - const float*, const int*, const float*, int*, - float*, int*); -void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, int*, const double*, - const double*, const int*, const double*, - int*, double*, int*); +void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, KK_INT*, const float*, + const float*, const KK_INT*, const float*, + KK_INT*, float*, KK_INT*); +void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, + const double*, const KK_INT*, const double*, + KK_INT*, double*, KK_INT*); // Although there is a cgeru, there is no csyr2u // Although there is a zgeru, there is no zsyr2u // Although there is a cgerc, there is no csyr2c, but there is cher2 (see below) @@ -309,33 +318,34 @@ void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, int*, const double*, /// Her2 /// -void F77_BLAS_MANGLE(cher2, CHER2)(const char*, int*, +void F77_BLAS_MANGLE(cher2, CHER2)(const char*, KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, int*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); /// /// Trsv /// -void F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, int*, - const float*, int*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, int*, - const double*, int*, - /* */ double*, int*); -void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, int*, - const std::complex*, int*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, int*, - const std::complex*, int*, - /* */ std::complex*, int*); +void F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, + KK_INT*, const float*, KK_INT*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, + KK_INT*, const double*, KK_INT*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, + KK_INT*, const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, + KK_INT*, const std::complex*, + KK_INT*, + /* */ std::complex*, KK_INT*); /// /// Gemm @@ -367,82 +377,82 @@ void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, /// Herk /// -void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, int*, int*, - const float*, const float*, int*, +void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, KK_INT*, KK_INT*, + const float*, const float*, KK_INT*, const float*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, int*, int*, - const double*, const double*, int*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, KK_INT*, KK_INT*, + const double*, const double*, KK_INT*, const double*, - /* */ double*, int*); -void F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, int*, int*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, int*, int*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); + /* */ std::complex*, KK_INT*); /// /// Trmm /// void F77_BLAS_MANGLE(strmm, STRMM)(const char*, const char*, const char*, - const char*, int*, int*, const float*, - const float*, int*, - /* */ float*, int*); + const char*, KK_INT*, KK_INT*, const float*, + const float*, KK_INT*, + /* */ float*, KK_INT*); void F77_BLAS_MANGLE(dtrmm, DTRMM)(const char*, const char*, const char*, - const char*, int*, int*, const double*, - const double*, int*, - /* */ double*, int*); + const char*, KK_INT*, KK_INT*, const double*, + const double*, KK_INT*, + /* */ double*, KK_INT*); void F77_BLAS_MANGLE(ctrmm, CTRMM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(ztrmm, ZTRMM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); /// /// Trsm /// void F77_BLAS_MANGLE(strsm, STRSM)(const char*, const char*, const char*, - const char*, int*, int*, const float*, - const float*, int*, - /* */ float*, int*); + const char*, KK_INT*, KK_INT*, const float*, + const float*, KK_INT*, + /* */ float*, KK_INT*); void F77_BLAS_MANGLE(dtrsm, DTRSM)(const char*, const char*, const char*, - const char*, int*, int*, const double*, - const double*, int*, - /* */ double*, int*); + const char*, KK_INT*, KK_INT*, const double*, + const double*, KK_INT*, + /* */ double*, KK_INT*); void F77_BLAS_MANGLE(ctrsm, CTRSM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); } -void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, - /* */ float* x, const int* x_inc); -void F77_BLAS_MANGLE(dscal, DSCAL)(const int* N, const double* alpha, - /* */ double* x, const int* x_inc); +void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, + /* */ float* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(dscal, DSCAL)(const KK_INT* N, const double* alpha, + /* */ double* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(cscal, - CSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + CSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(zscal, - ZSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); #define F77_FUNC_SSCAL F77_BLAS_MANGLE(sscal, SSCAL) #define F77_FUNC_DSCAL F77_BLAS_MANGLE(dscal, DSCAL) @@ -554,35 +564,36 @@ namespace Impl { /// template <> -void HostBlas::scal(int n, const float alpha, - /* */ float* x, int x_inc) { +void HostBlas::scal(KK_INT n, const float alpha, + /* */ float* x, KK_INT x_inc) { F77_FUNC_SSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas::iamax(int n, const float* x, int x_inc) { +KK_INT HostBlas::iamax(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_ISAMAX(&n, x, &x_inc); } template <> -float HostBlas::nrm2(int n, const float* x, int x_inc) { +float HostBlas::nrm2(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_SNRM2(&n, x, &x_inc); } template <> -float HostBlas::asum(int n, const float* x, int x_inc) { +float HostBlas::asum(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_SASUM(&n, x, &x_inc); } template <> -float HostBlas::dot(int n, const float* x, int x_inc, const float* y, - int y_inc) { +float HostBlas::dot(KK_INT n, const float* x, KK_INT x_inc, + const float* y, KK_INT y_inc) { return F77_FUNC_SDOT(&n, x, &x_inc, y, &y_inc); } template <> -void HostBlas::axpy(int n, const float alpha, const float* x, int x_inc, - /* */ float* y, int y_inc) { +void HostBlas::axpy(KK_INT n, const float alpha, const float* x, + KK_INT x_inc, + /* */ float* y, KK_INT y_inc) { F77_FUNC_SAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(int const N, float* X, int const incx, float* Y, - int const incy, float* c, float* s) { +void HostBlas::rot(KK_INT const N, float* X, KK_INT const incx, float* Y, + KK_INT const incy, float* c, float* s) { F77_FUNC_SROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -590,8 +601,8 @@ void HostBlas::rotg(float* a, float* b, float* c, float* s) { F77_FUNC_SROTG(a, b, c, s); } template <> -void HostBlas::rotm(const int n, float* X, const int incx, float* Y, - const int incy, const float* param) { +void HostBlas::rotm(const KK_INT n, float* X, const KK_INT incx, + float* Y, const KK_INT incy, const float* param) { F77_FUNC_SROTM(&n, X, &incx, Y, &incy, param); } template <> @@ -600,38 +611,38 @@ void HostBlas::rotmg(float* d1, float* d2, float* x1, const float* y1, F77_FUNC_SROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(int const N, float* X, int const incx, float* Y, - int const incy) { +void HostBlas::swap(KK_INT const N, float* X, KK_INT const incx, + float* Y, KK_INT const incy) { F77_FUNC_SSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, int m, int n, const float alpha, - const float* a, int lda, const float* b, int ldb, - const float beta, - /* */ float* c, int ldc) { +void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, + const float alpha, const float* a, KK_INT lda, + const float* b, KK_INT ldb, const float beta, + /* */ float* c, KK_INT ldc) { F77_FUNC_SGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger(int m, int n, const float alpha, const float* x, - int incx, const float* y, int incy, float* a, - int lda) { +void HostBlas::ger(KK_INT m, KK_INT n, const float alpha, const float* x, + KK_INT incx, const float* y, KK_INT incy, float* a, + KK_INT lda) { F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, int n, const float alpha, - const float* x, int incx, float* a, int lda) { +void HostBlas::syr(const char uplo, KK_INT n, const float alpha, + const float* x, KK_INT incx, float* a, KK_INT lda) { F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> -void HostBlas::syr2(const char uplo, int n, const float alpha, - const float* x, int incx, const float* y, int incy, - float* a, int lda) { +void HostBlas::syr2(const char uplo, KK_INT n, const float alpha, + const float* x, KK_INT incx, const float* y, + KK_INT incy, float* a, KK_INT lda) { F77_FUNC_SSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, - int m, const float* a, int lda, - /* */ float* b, int ldb) { + KK_INT m, const float* a, KK_INT lda, + /* */ float* b, KK_INT ldb) { F77_FUNC_STRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> @@ -644,25 +655,25 @@ void HostBlas::gemm(const char transa, const char transb, KK_INT m, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, int n, int k, - const float alpha, const float* a, int lda, - const float beta, - /* */ float* c, int ldc) { +void HostBlas::herk(const char transa, const char transb, KK_INT n, + KK_INT k, const float alpha, const float* a, + KK_INT lda, const float beta, + /* */ float* c, KK_INT ldc) { F77_FUNC_SSYRK(&transa, &transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const float alpha, - const float* a, int lda, - /* */ float* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const float alpha, const float* a, KK_INT lda, + /* */ float* b, KK_INT ldb) { F77_FUNC_STRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const float alpha, - const float* a, int lda, - /* */ float* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const float alpha, const float* a, KK_INT lda, + /* */ float* b, KK_INT ldb) { F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } @@ -672,36 +683,36 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, /// template <> -void HostBlas::scal(int n, const double alpha, - /* */ double* x, int x_inc) { +void HostBlas::scal(KK_INT n, const double alpha, + /* */ double* x, KK_INT x_inc) { F77_FUNC_DSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas::iamax(int n, const double* x, int x_inc) { +KK_INT HostBlas::iamax(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_IDAMAX(&n, x, &x_inc); } template <> -double HostBlas::nrm2(int n, const double* x, int x_inc) { +double HostBlas::nrm2(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_DNRM2(&n, x, &x_inc); } template <> -double HostBlas::asum(int n, const double* x, int x_inc) { +double HostBlas::asum(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_DASUM(&n, x, &x_inc); } template <> -double HostBlas::dot(int n, const double* x, int x_inc, const double* y, - int y_inc) { +double HostBlas::dot(KK_INT n, const double* x, KK_INT x_inc, + const double* y, KK_INT y_inc) { return F77_FUNC_DDOT(&n, x, &x_inc, y, &y_inc); } template <> -void HostBlas::axpy(int n, const double alpha, const double* x, - int x_inc, - /* */ double* y, int y_inc) { +void HostBlas::axpy(KK_INT n, const double alpha, const double* x, + KK_INT x_inc, + /* */ double* y, KK_INT y_inc) { F77_FUNC_DAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(int const N, double* X, int const incx, double* Y, - int const incy, double* c, double* s) { +void HostBlas::rot(KK_INT const N, double* X, KK_INT const incx, + double* Y, KK_INT const incy, double* c, double* s) { F77_FUNC_DROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -709,8 +720,8 @@ void HostBlas::rotg(double* a, double* b, double* c, double* s) { F77_FUNC_DROTG(a, b, c, s); } template <> -void HostBlas::rotm(const int n, double* X, const int incx, double* Y, - const int incy, const double* param) { +void HostBlas::rotm(const KK_INT n, double* X, const KK_INT incx, + double* Y, const KK_INT incy, const double* param) { F77_FUNC_DROTM(&n, X, &incx, Y, &incy, param); } template <> @@ -719,38 +730,39 @@ void HostBlas::rotmg(double* d1, double* d2, double* x1, F77_FUNC_DROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(int const N, double* X, int const incx, double* Y, - int const incy) { +void HostBlas::swap(KK_INT const N, double* X, KK_INT const incx, + double* Y, KK_INT const incy) { F77_FUNC_DSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, int m, int n, const double alpha, - const double* a, int lda, const double* b, int ldb, - const double beta, - /* */ double* c, int ldc) { +void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, + const double alpha, const double* a, KK_INT lda, + const double* b, KK_INT ldb, const double beta, + /* */ double* c, KK_INT ldc) { F77_FUNC_DGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger(int m, int n, const double alpha, const double* x, - int incx, const double* y, int incy, double* a, - int lda) { +void HostBlas::ger(KK_INT m, KK_INT n, const double alpha, + const double* x, KK_INT incx, const double* y, + KK_INT incy, double* a, KK_INT lda) { F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, int n, const double alpha, - const double* x, int incx, double* a, int lda) { +void HostBlas::syr(const char uplo, KK_INT n, const double alpha, + const double* x, KK_INT incx, double* a, + KK_INT lda) { F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> -void HostBlas::syr2(const char uplo, int n, const double alpha, - const double* x, int incx, const double* y, - int incy, double* a, int lda) { +void HostBlas::syr2(const char uplo, KK_INT n, const double alpha, + const double* x, KK_INT incx, const double* y, + KK_INT incy, double* a, KK_INT lda) { F77_FUNC_DSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, - int m, const double* a, int lda, - /* */ double* b, int ldb) { + KK_INT m, const double* a, KK_INT lda, + /* */ double* b, KK_INT ldb) { F77_FUNC_DTRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> @@ -763,25 +775,25 @@ void HostBlas::gemm(const char transa, const char transb, KK_INT m, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, int n, int k, - const double alpha, const double* a, int lda, - const double beta, - /* */ double* c, int ldc) { +void HostBlas::herk(const char transa, const char transb, KK_INT n, + KK_INT k, const double alpha, const double* a, + KK_INT lda, const double beta, + /* */ double* c, KK_INT ldc) { F77_FUNC_DSYRK(&transa, &transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const double alpha, - const double* a, int lda, - /* */ double* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const double alpha, const double* a, KK_INT lda, + /* */ double* b, KK_INT ldb) { F77_FUNC_DTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const double alpha, - const double* a, int lda, - /* */ double* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const double alpha, const double* a, KK_INT lda, + /* */ double* b, KK_INT ldb) { F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } @@ -791,31 +803,34 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, /// template <> -void HostBlas >::scal(int n, +void HostBlas >::scal(KK_INT n, const std::complex alpha, /* */ std::complex* x, - int x_inc) { + KK_INT x_inc) { F77_FUNC_CSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas >::iamax(int n, const std::complex* x, - int x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_ICAMAX(&n, x, &x_inc); } template <> -float HostBlas >::nrm2(int n, const std::complex* x, - int x_inc) { +float HostBlas >::nrm2(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_SCNRM2(&n, x, &x_inc); } template <> -float HostBlas >::asum(int n, const std::complex* x, - int x_inc) { +float HostBlas >::asum(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_SCASUM(&n, x, &x_inc); } template <> std::complex HostBlas >::dot( - int n, const std::complex* x, int x_inc, - const std::complex* y, int y_inc) { + KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) _kk_float2 res = F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); return std::complex(res.vals[0], res.vals[1]); @@ -826,18 +841,20 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(int n, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, - int x_inc, + KK_INT x_inc, /* */ std::complex* y, - int y_inc) { + KK_INT y_inc) { F77_FUNC_CAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas >::rot(int const N, std::complex* X, - int const incx, std::complex* Y, - int const incy, float* c, float* s) { +void HostBlas >::rot(KK_INT const N, std::complex* X, + KK_INT const incx, + std::complex* Y, + KK_INT const incy, float* c, + float* s) { F77_FUNC_CROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -847,38 +864,37 @@ void HostBlas >::rotg(std::complex* a, F77_FUNC_CROTG(a, b, c, s); } template <> -void HostBlas >::swap(int const N, std::complex* X, - int const incx, +void HostBlas >::swap(KK_INT const N, + std::complex* X, + KK_INT const incx, std::complex* Y, - int const incy) { + KK_INT const incy) { F77_FUNC_CSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas >::gemv(const char trans, int m, int n, - const std::complex alpha, - const std::complex* a, int lda, - const std::complex* b, int ldb, - const std::complex beta, - /* */ std::complex* c, - int ldc) { +void HostBlas >::gemv( + const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_CGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> void HostBlas >::geru( - int m, int n, const std::complex alpha, const std::complex* x, - int incx, const std::complex* y, int incy, std::complex* a, - int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> void HostBlas >::gerc( - int m, int n, const std::complex alpha, const std::complex* x, - int incx, const std::complex* y, int incy, std::complex* a, - int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); @@ -886,26 +902,27 @@ void HostBlas >::gerc( template <> template <> void HostBlas >::cher( - const char uplo, int n, const float alpha, const std::complex* x, - int incx, std::complex* a, int lda) { + const char uplo, KK_INT n, const float alpha, const std::complex* x, + KK_INT incx, std::complex* a, KK_INT lda) { F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> void HostBlas >::cher2( - const char uplo, int n, const std::complex alpha, - const std::complex* x, int incx, const std::complex* y, - int incy, std::complex* a, int lda) { + const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> void HostBlas >::trsv(const char uplo, const char transa, - const char diag, int m, - const std::complex* a, int lda, + const char diag, KK_INT m, + const std::complex* a, + KK_INT lda, /* */ std::complex* b, - int ldb) { + KK_INT ldb) { F77_FUNC_CTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } @@ -921,37 +938,31 @@ void HostBlas >::gemm( (std::complex*)c, &ldc); } template <> -void HostBlas >::herk(const char transa, const char transb, - int n, int k, - const std::complex alpha, - const std::complex* a, int lda, - const std::complex beta, - /* */ std::complex* c, - int ldc) { +void HostBlas >::herk( + const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_CHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::trmm(const char side, const char uplo, - const char transa, const char diag, - int m, int n, - const std::complex alpha, - const std::complex* a, int lda, - /* */ std::complex* b, - int ldb) { +void HostBlas >::trmm( + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_CTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> -void HostBlas >::trsm(const char side, const char uplo, - const char transa, const char diag, - int m, int n, - const std::complex alpha, - const std::complex* a, int lda, - /* */ std::complex* b, - int ldb) { +void HostBlas >::trsm( + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_CTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); @@ -962,33 +973,34 @@ void HostBlas >::trsm(const char side, const char uplo, /// template <> -void HostBlas >::scal(int n, +void HostBlas >::scal(KK_INT n, const std::complex alpha, /* */ std::complex* x, - int x_inc) { + KK_INT x_inc) { F77_FUNC_ZSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas >::iamax(int n, const std::complex* x, - int x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_IZAMAX(&n, x, &x_inc); } template <> -double HostBlas >::nrm2(int n, +double HostBlas >::nrm2(KK_INT n, const std::complex* x, - int x_inc) { + KK_INT x_inc) { return F77_FUNC_DZNRM2(&n, x, &x_inc); } template <> -double HostBlas >::asum(int n, +double HostBlas >::asum(KK_INT n, const std::complex* x, - int x_inc) { + KK_INT x_inc) { return F77_FUNC_DZASUM(&n, x, &x_inc); } template <> std::complex HostBlas >::dot( - int n, const std::complex* x, int x_inc, - const std::complex* y, int y_inc) { + KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) _kk_double2 res = F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); return std::complex(res.vals[0], res.vals[1]); @@ -999,20 +1011,18 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(int n, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, - int x_inc, + KK_INT x_inc, /* */ std::complex* y, - int y_inc) { + KK_INT y_inc) { F77_FUNC_ZAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas >::rot(int const N, std::complex* X, - int const incx, - std::complex* Y, - int const incy, double* c, - double* s) { +void HostBlas >::rot( + KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy, double* c, double* s) { F77_FUNC_ZROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -1022,36 +1032,37 @@ void HostBlas >::rotg(std::complex* a, F77_FUNC_ZROTG(a, b, c, s); } template <> -void HostBlas >::swap(int const N, std::complex* X, - int const incx, +void HostBlas >::swap(KK_INT const N, + std::complex* X, + KK_INT const incx, std::complex* Y, - int const incy) { + KK_INT const incy) { F77_FUNC_ZSWAP(&N, X, &incx, Y, &incy); } template <> void HostBlas >::gemv( - const char trans, int m, int n, const std::complex alpha, - const std::complex* a, int lda, const std::complex* b, - int ldb, const std::complex beta, - /* */ std::complex* c, int ldc) { + const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_ZGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> void HostBlas >::geru( - int m, int n, const std::complex alpha, - const std::complex* x, int incx, const std::complex* y, - int incy, std::complex* a, int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> void HostBlas >::gerc( - int m, int n, const std::complex alpha, - const std::complex* x, int incx, const std::complex* y, - int incy, std::complex* a, int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); @@ -1059,27 +1070,28 @@ void HostBlas >::gerc( template <> template <> void HostBlas >::zher( - const char uplo, int n, const double alpha, const std::complex* x, - int incx, std::complex* a, int lda) { + const char uplo, KK_INT n, const double alpha, + const std::complex* x, KK_INT incx, std::complex* a, + KK_INT lda) { F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> void HostBlas >::zher2( - const char uplo, int n, const std::complex alpha, - const std::complex* x, int incx, const std::complex* y, - int incy, std::complex* a, int lda) { + const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> void HostBlas >::trsv(const char uplo, const char transa, - const char diag, int m, + const char diag, KK_INT m, const std::complex* a, - int lda, + KK_INT lda, /* */ std::complex* b, - int ldb) { + KK_INT ldb) { F77_FUNC_ZTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } @@ -1097,30 +1109,30 @@ void HostBlas >::gemm( } template <> void HostBlas >::herk( - const char transa, const char transb, int n, int k, - const std::complex alpha, const std::complex* a, int lda, + const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, const std::complex beta, - /* */ std::complex* c, int ldc) { + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_ZHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, &ldc); } template <> void HostBlas >::trmm( - const char side, const char uplo, const char transa, const char diag, int m, - int n, const std::complex alpha, const std::complex* a, - int lda, - /* */ std::complex* b, int ldb) { + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_ZTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> void HostBlas >::trsm( - const char side, const char uplo, const char transa, const char diag, int m, - int n, const std::complex alpha, const std::complex* a, - int lda, - /* */ std::complex* b, int ldb) { + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_ZTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 8e8781bfcf..5fb7c1f624 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -43,87 +43,88 @@ struct HostBlas { typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; - static void scal(int n, const T alpha, - /* */ T *x, int x_inc); + static void scal(KK_INT n, const T alpha, + /* */ T *x, KK_INT x_inc); - static int iamax(int n, const T *x, int x_inc); + static KK_INT iamax(KK_INT n, const T *x, KK_INT x_inc); - static mag_type nrm2(int n, const T *x, int x_inc); + static mag_type nrm2(KK_INT n, const T *x, KK_INT x_inc); - static mag_type asum(int n, const T *x, int x_inc); + static mag_type asum(KK_INT n, const T *x, KK_INT x_inc); - static T dot(int n, const T *x, int x_inc, const T *y, int y_inc); + static T dot(KK_INT n, const T *x, KK_INT x_inc, const T *y, KK_INT y_inc); - static void axpy(int n, const T alpha, const T *x, int x_inc, - /* */ T *y, int y_inc); + static void axpy(KK_INT n, const T alpha, const T *x, KK_INT x_inc, + /* */ T *y, KK_INT y_inc); - static void rot(int const N, T *X, int const incx, T *Y, int const incy, - mag_type *c, mag_type *s); + static void rot(KK_INT const N, T *X, KK_INT const incx, T *Y, + KK_INT const incy, mag_type *c, mag_type *s); static void rotg(T *a, T *b, mag_type *c, T *s); - static void rotm(const int n, T *X, const int incx, T *Y, const int incy, - T const *param); + static void rotm(const KK_INT n, T *X, const KK_INT incx, T *Y, + const KK_INT incy, T const *param); static void rotmg(T *d1, T *d2, T *x1, const T *y1, T *param); - static void swap(int const N, T *X, int const incx, T *Y, int const incy); + static void swap(KK_INT const N, T *X, KK_INT const incx, T *Y, + KK_INT const incy); - static void gemv(const char trans, int m, int n, const T alpha, const T *a, - int lda, const T *b, int ldb, const T beta, - /* */ T *c, int ldc); + static void gemv(const char trans, KK_INT m, KK_INT n, const T alpha, + const T *a, KK_INT lda, const T *b, KK_INT ldb, const T beta, + /* */ T *c, KK_INT ldc); - static void ger(int m, int n, const T alpha, const T *x, int incx, const T *y, - int incy, T *a, int lda); + static void ger(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, + const T *y, KK_INT incy, T *a, KK_INT lda); - static void geru(int m, int n, const T alpha, const T *x, int incx, - const T *y, int incy, T *a, int lda); + static void geru(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, + const T *y, KK_INT incy, T *a, KK_INT lda); - static void gerc(int m, int n, const T alpha, const T *x, int incx, - const T *y, int incy, T *a, int lda); + static void gerc(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, + const T *y, KK_INT incy, T *a, KK_INT lda); - static void syr(const char uplo, int n, const T alpha, const T *x, int incx, - T *a, int lda); + static void syr(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); - static void syr2(const char uplo, int n, const T alpha, const T *x, int incx, - const T *y, int incy, T *a, int lda); + static void syr2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); template - static void cher(const char uplo, int n, const tAlpha alpha, const T *x, - int incx, T *a, int lda); + static void cher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); template - static void zher(const char uplo, int n, const tAlpha alpha, const T *x, - int incx, T *a, int lda); + static void zher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); - static void cher2(const char uplo, int n, const T alpha, const T *x, int incx, - const T *y, int incy, T *a, int lda); + static void cher2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); - static void zher2(const char uplo, int n, const T alpha, const T *x, int incx, - const T *y, int incy, T *a, int lda); + static void zher2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); - static void trsv(const char uplo, const char transa, const char diag, int m, - const T *a, int lda, - /* */ T *b, int ldb); + static void trsv(const char uplo, const char transa, const char diag, + KK_INT m, const T *a, KK_INT lda, + /* */ T *b, KK_INT ldb); static void gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, const T alpha, const T *a, KK_INT lda, const T *b, KK_INT ldb, const T beta, /* */ T *c, KK_INT ldc); - static void herk(const char transa, const char transb, int n, int k, - const T alpha, const T *a, int lda, const T beta, - /* */ T *c, int ldc); + static void herk(const char transa, const char transb, KK_INT n, KK_INT k, + const T alpha, const T *a, KK_INT lda, const T beta, + /* */ T *c, KK_INT ldc); static void trmm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const T alpha, const T *a, - int lda, - /* */ T *b, int ldb); + const char diag, KK_INT m, KK_INT n, const T alpha, + const T *a, KK_INT lda, + /* */ T *b, KK_INT ldb); static void trsm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const T alpha, const T *a, - int lda, - /* */ T *b, int ldb); + const char diag, KK_INT m, KK_INT n, const T alpha, + const T *a, KK_INT lda, + /* */ T *b, KK_INT ldb); }; } // namespace Impl } // namespace KokkosBlas From 62aecceb9704ddcb300f6793784cc6f41dd38e5d Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 20 Feb 2024 16:27:57 -0700 Subject: [PATCH 167/326] Fix weird Trilinos compiler error It seemed to have a problem with these deep_copies, so just do the copy by hand like it was being done before my recent trsv PR. --- sparse/impl/KokkosSparse_trsv_impl.hpp | 35 +++++++++++++++++--------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index 41f7083398..9adb029d12 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -44,6 +44,17 @@ struct TrsvWrap { using sview_1d = typename Kokkos::View; using STS = Kokkos::ArithTraits; + static inline void manual_copy(RangeMultiVectorType X, + DomainMultiVectorType Y) { + auto numRows = X.extent(0); + auto numVecs = X.extent(1); + for (decltype(numRows) i = 0; i < numRows; ++i) { + for (decltype(numVecs) j = 0; j < numVecs; ++j) { + X(i, j) = Y(i, j); + } + } + } + struct CommonUnblocked { CommonUnblocked(const lno_t block_size) { KK_REQUIRE_MSG(block_size == 1, @@ -183,7 +194,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); for (lno_t r = 0; r < numRows; ++r) { const offset_type beg = ptr(r); @@ -210,7 +221,7 @@ struct TrsvWrap { CommonOps co(block_size); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); for (lno_t r = 0; r < numRows; ++r) { auto A_rr = co.zero(); @@ -253,7 +264,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); // If lno_t is unsigned and numRows is 0, the loop // below will have entirely the wrong number of iterations. @@ -303,7 +314,7 @@ struct TrsvWrap { CommonOps co(block_size); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); // If lno_t is unsigned and numRows is 0, the loop // below will have entirely the wrong number of iterations. @@ -371,7 +382,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); // If lno_t is unsigned and numCols is 0, the loop // below will have entirely the wrong number of iterations. @@ -420,7 +431,7 @@ struct TrsvWrap { typename CrsMatrixType::index_type ind = A.graph.entries; typename CrsMatrixType::values_type val = A.values; - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); @@ -481,7 +492,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); for (lno_t c = 0; c < numCols; ++c) { const offset_type beg = ptr(c); @@ -510,7 +521,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); // If lno_t is unsigned and numCols is 0, the loop // below will have entirely the wrong number of iterations. @@ -562,7 +573,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); // If lno_t is unsigned and numCols is 0, the loop // below will have entirely the wrong number of iterations. @@ -620,7 +631,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); for (lno_t c = 0; c < numCols; ++c) { const offset_type beg = ptr(c); @@ -657,7 +668,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); for (lno_t c = 0; c < numCols; ++c) { const offset_type beg = ptr(c); @@ -686,7 +697,7 @@ struct TrsvWrap { KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); - Kokkos::deep_copy(X, Y); + manual_copy(X, Y); for (lno_t c = 0; c < numCols; ++c) { const offset_type beg = ptr(c); From 706bf972602d25bbacc6a45d371136e72e96e5bd Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 17 Jan 2024 13:15:48 -0700 Subject: [PATCH 168/326] Update changelog --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59c3f5a647..46607df9d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Change Log +## [4.2.01](https://github.com/kokkos/kokkos-kernels/tree/4.2.01) (2024-01-17) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.2.00...4.2.01) + +### Bug Fixes: + +- LAPACK: magma tpl fixes [\#2044](https://github.com/kokkos/kokkos-kernels/pull/2044) +- BLAS: fix bug in TPL layer of `KokkosBlas::swap` [\#2052](https://github.com/kokkos/kokkos-kernels/pull/2052) + ## [4.2.00](https://github.com/kokkos/kokkos-kernels/tree/4.2.00) (2023-11-06) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.1.00...4.2.00) From 06646f4cc067a11400a76c420221ffc25b843a1b Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 25 Jan 2024 11:22:02 -0700 Subject: [PATCH 169/326] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46607df9d7..6c35eda7d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - LAPACK: magma tpl fixes [\#2044](https://github.com/kokkos/kokkos-kernels/pull/2044) - BLAS: fix bug in TPL layer of `KokkosBlas::swap` [\#2052](https://github.com/kokkos/kokkos-kernels/pull/2052) +- ROCm 6 deprecation fixes for rocsparse [\#2050](https://github.com/kokkos/kokkos-kernels/pull/2050) ## [4.2.00](https://github.com/kokkos/kokkos-kernels/tree/4.2.00) (2023-11-06) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.1.00...4.2.00) From 0ddc7449b0954ae2f6c9bbb2ad7e1a12a557d428 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 21 Feb 2024 11:14:27 -0700 Subject: [PATCH 170/326] Block spiluk follow up (#2085) * Fix for gemm * Remove unused divide method * Enhancements to spiluk test * Progress. Block spiluk now checks out against analytical results * LUPrec test with spiluk woring * Disable spiluk LU test on non-host * Enhancements to spiluk test * Clean up a few issues uncovered by gh review --- perf_test/sparse/KokkosSparse_spiluk.cpp | 6 - .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 480 +++++--------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 8 +- sparse/src/KokkosSparse_LUPrec.hpp | 89 ++- sparse/src/KokkosSparse_spiluk_handle.hpp | 17 - sparse/unit_test/Test_Sparse_spiluk.hpp | 588 +++++++++++++----- sparse/unit_test/Test_vector_fixtures.hpp | 5 +- 7 files changed, 653 insertions(+), 540 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index 331ae9ec82..c85b126019 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -144,12 +144,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, // std::cout << "Create handle" << std::endl; switch (test) { - case LVLSCHED_RP: - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, - EXPAND_FACT * nnz * (fill_lev + 1), - EXPAND_FACT * nnz * (fill_lev + 1)); - kh.get_spiluk_handle()->print_algorithm(); - break; case LVLSCHED_TP1: kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, EXPAND_FACT * nnz * (fill_lev + 1), diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 9484a02c11..b3b5dfa277 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -58,7 +58,6 @@ struct IlukWrap { using team_policy = typename IlukHandle::TeamPolicy; using member_type = typename team_policy::member_type; using range_policy = typename IlukHandle::RangePolicy; - using sview_1d = typename Kokkos::View; static team_policy get_team_policy(const size_type nrows, const int team_size) { @@ -68,6 +67,7 @@ struct IlukWrap { } else { rv = team_policy(nrows, team_size); } + return rv; } @@ -80,17 +80,7 @@ struct IlukWrap { } else { rv = team_policy(exe_space, nrows, team_size); } - return rv; - } - static range_policy get_range_policy(const lno_t start, const lno_t end) { - range_policy rv(start, end); - return rv; - } - - static range_policy get_range_policy(execution_space exe_space, - const lno_t start, const lno_t end) { - range_policy rv(exe_space, start, end); return rv; } @@ -116,8 +106,7 @@ struct IlukWrap { WorkViewType iw; lno_t lev_start; - // unblocked does not require any buffer - static constexpr size_type BUFF_SIZE = 1; + using reftype = scalar_t &; Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, @@ -155,9 +144,6 @@ struct IlukWrap { } // lset_id - KOKKOS_INLINE_FUNCTION - void lset_id(const size_type nnz) const { L_values(nnz) = scalar_t(1.0); } - KOKKOS_INLINE_FUNCTION void lset_id(const member_type &team, const size_type nnz) const { // Not sure a Kokkos::single is really needed here since the @@ -167,9 +153,6 @@ struct IlukWrap { } // divide. lhs /= rhs - KOKKOS_INLINE_FUNCTION - void divide(scalar_t &lhs, const scalar_t &rhs) const { lhs /= rhs; } - KOKKOS_INLINE_FUNCTION void divide(const member_type &team, scalar_t &lhs, const scalar_t &rhs) const { @@ -177,15 +160,11 @@ struct IlukWrap { team.team_barrier(); } - // add. lhs += rhs + // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION - void add(scalar_t &lhs, const scalar_t &rhs) const { lhs += rhs; } - - // multiply: return (alpha * lhs) * rhs - KOKKOS_INLINE_FUNCTION - scalar_t multiply(const scalar_t &alpha, const scalar_t &lhs, - const scalar_t &rhs, scalar_t *) const { - return alpha * lhs * rhs; + void multiply_subtract(const scalar_t &A, const scalar_t &B, + scalar_t &C) const { + C -= A * B; } // lget @@ -205,6 +184,10 @@ struct IlukWrap { bool uequal(const size_type nnz, const scalar_t &value) const { return U_values(nnz) == value; } + + // print + KOKKOS_INLINE_FUNCTION + void print(const scalar_t &item) const { std::cout << item << std::endl; } }; // Partial specialization for block support @@ -228,32 +211,27 @@ struct IlukWrap { lno_t lev_start; size_type block_size; size_type block_items; - sview_1d ones; - // blocked requires a buffer to store gemm output - static constexpr size_type BUFF_SIZE = 128; + // BSR data is in LayoutRight! + using Layout = Kokkos::LayoutRight; - using LValuesUnmanaged2DBlockType = Kokkos::View< - typename LValuesType::value_type **, - typename KokkosKernels::Impl::GetUnifiedLayout< - LValuesType>::array_layout, + using LBlock = Kokkos::View< + typename LValuesType::value_type **, Layout, typename LValuesType::device_type, Kokkos::MemoryTraits >; - using UValuesUnmanaged2DBlockType = Kokkos::View< - typename UValuesType::value_type **, - typename KokkosKernels::Impl::GetUnifiedLayout< - UValuesType>::array_layout, + using UBlock = Kokkos::View< + typename UValuesType::value_type **, Layout, typename UValuesType::device_type, Kokkos::MemoryTraits >; - using AValuesUnmanaged2DBlockType = Kokkos::View< - typename AValuesType::value_type **, - typename KokkosKernels::Impl::GetUnifiedLayout< - AValuesType>::array_layout, + using ABlock = Kokkos::View< + typename AValuesType::value_type **, Layout, typename AValuesType::device_type, Kokkos::MemoryTraits >; + using reftype = LBlock; + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, @@ -274,12 +252,9 @@ struct IlukWrap { iw(iw_), lev_start(lev_start_), block_size(block_size_), - block_items(block_size * block_size), - ones("ones", block_size) { - Kokkos::deep_copy(ones, 1.0); + block_items(block_size * block_size) { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); - KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); } // lset @@ -289,8 +264,7 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void lset(const size_type block, - const AValuesUnmanaged2DBlockType &rhs) const { + void lset(const size_type block, const ABlock &rhs) const { auto lblock = lget(block); for (size_type i = 0; i < block_size; ++i) { for (size_type j = 0; j < block_size; ++j) { @@ -306,8 +280,7 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void uset(const size_type block, - const AValuesUnmanaged2DBlockType &rhs) const { + void uset(const size_type block, const ABlock &rhs) const { auto ublock = uget(block); for (size_type i = 0; i < block_size; ++i) { for (size_type j = 0; j < block_size; ++j) { @@ -317,11 +290,6 @@ struct IlukWrap { } // lset_id - KOKKOS_INLINE_FUNCTION - void lset_id(const size_type block) const { - KokkosBatched::SerialSetIdentity::invoke(lget(block)); - } - KOKKOS_INLINE_FUNCTION void lset_id(const member_type &team, const size_type block) const { KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); @@ -329,19 +297,8 @@ struct IlukWrap { // divide. lhs /= rhs KOKKOS_INLINE_FUNCTION - void divide(const LValuesUnmanaged2DBlockType &lhs, - const UValuesUnmanaged2DBlockType &rhs) const { - KokkosBatched::SerialTrsm< - KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, // not 100% on this - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>:: // not 100% on this - invoke(1.0, rhs, lhs); - } - - KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, const LValuesUnmanaged2DBlockType &lhs, - const UValuesUnmanaged2DBlockType &rhs) const { + void divide(const member_type &team, const LBlock &lhs, + const UBlock &rhs) const { KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, // not 100% on this @@ -350,47 +307,38 @@ struct IlukWrap { invoke(team, 1.0, rhs, lhs); } - // add. lhs += rhs - template - KOKKOS_INLINE_FUNCTION void add(Lview lhs, const Rview &rhs) const { - KokkosBatched::SerialAxpy::invoke(ones, rhs, lhs); - } - - // multiply: return (alpha * lhs) * rhs - KOKKOS_INLINE_FUNCTION - LValuesUnmanaged2DBlockType multiply(const scalar_t &alpha, - const UValuesUnmanaged2DBlockType &lhs, - const LValuesUnmanaged2DBlockType &rhs, - scalar_t *buff) const { - LValuesUnmanaged2DBlockType result(&buff[0], block_size, block_size); - KokkosBatched::SerialGemm:: - invoke( - alpha, lhs, rhs, 0.0, result); - return result; + // multiply_subtract. C -= A * B + template + KOKKOS_INLINE_FUNCTION void multiply_subtract(const UBlock &A, + const LBlock &B, + CView &C) const { + // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Unblocked>::invoke( + -1.0, A, B, 1.0, C); } // lget KOKKOS_INLINE_FUNCTION - LValuesUnmanaged2DBlockType lget(const size_type block) const { - return LValuesUnmanaged2DBlockType( - L_values.data() + (block * block_items), block_size, block_size); + LBlock lget(const size_type block) const { + return LBlock(L_values.data() + (block * block_items), block_size, + block_size); } // uget KOKKOS_INLINE_FUNCTION - UValuesUnmanaged2DBlockType uget(const size_type block) const { - return UValuesUnmanaged2DBlockType( - U_values.data() + (block * block_items), block_size, block_size); + UBlock uget(const size_type block) const { + return UBlock(U_values.data() + (block * block_items), block_size, + block_size); } // aget KOKKOS_INLINE_FUNCTION - AValuesUnmanaged2DBlockType aget(const size_type block) const { - return AValuesUnmanaged2DBlockType( - A_values.data() + (block * block_items), block_size, block_size); + ABlock aget(const size_type block) const { + return ABlock(A_values.data() + (block * block_items), block_size, + block_size); } // uequal @@ -406,102 +354,17 @@ struct IlukWrap { } return true; } - }; - - template - struct ILUKLvlSchedRPNumericFunctor - : public Common { - using Base = Common; - ILUKLvlSchedRPNumericFunctor( - const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const lno_t &lev_start_, - const size_type &block_size_ = 0) - : Base(A_row_map_, A_entries_, A_values_, L_row_map_, L_entries_, - L_values_, U_row_map_, U_entries_, U_values_, level_idx_, iw_, - lev_start_, block_size_) {} - - KOKKOS_FUNCTION - void operator()(const lno_t i) const { - scalar_t buff[Base::BUFF_SIZE]; - - const auto rowid = Base::level_idx(i); - const auto tid = i - Base::lev_start; - auto k1 = Base::L_row_map(rowid); - auto k2 = Base::L_row_map(rowid + 1) - 1; - Base::lset_id(k2); - for (auto k = k1; k < k2; ++k) { - const auto col = Base::L_entries(k); - Base::lset(k, 0.0); - Base::iw(tid, col) = k; - } - - k1 = Base::U_row_map(rowid); - k2 = Base::U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - const auto col = Base::U_entries(k); - Base::uset(k, 0.0); - Base::iw(tid, col) = k; - } - - k1 = Base::A_row_map(rowid); - k2 = Base::A_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - const auto col = Base::A_entries(k); - const auto ipos = Base::iw(tid, col); - if (col < rowid) { - Base::lset(ipos, Base::aget(k)); - } else { - Base::uset(ipos, Base::aget(k)); + // print + KOKKOS_INLINE_FUNCTION + void print(const LBlock &item) const { + for (size_type i = 0; i < block_size; ++i) { + std::cout << " "; + for (size_type j = 0; j < block_size; ++j) { + std::cout << item(i, j) << " "; } + std::cout << std::endl; } - - // Eliminate prev rows - k1 = Base::L_row_map(rowid); - k2 = Base::L_row_map(rowid + 1) - 1; - for (auto k = k1; k < k2; ++k) { - const auto prev_row = Base::L_entries(k); - const auto u_diag = Base::uget(Base::U_row_map(prev_row)); - Base::divide(Base::lget(k), u_diag); - auto fact = Base::lget(k); - for (auto kk = Base::U_row_map(prev_row) + 1; - kk < Base::U_row_map(prev_row + 1); ++kk) { - const auto col = Base::U_entries(kk); - const auto ipos = Base::iw(tid, col); - if (ipos == -1) continue; - const auto lxu = Base::multiply(-1.0, Base::uget(kk), fact, &buff[0]); - if (col < rowid) { - Base::add(Base::lget(ipos), lxu); - } else { - Base::add(Base::uget(ipos), lxu); - } - } // end for kk - } // end for k - - const auto ipos = Base::iw(tid, rowid); - if (Base::uequal(ipos, 0.0)) { - Base::uset(ipos, 1e6); - } - - // Reset - k1 = Base::L_row_map(rowid); - k2 = Base::L_row_map(rowid + 1) - 1; - for (auto k = k1; k < k2; ++k) Base::iw(tid, Base::L_entries(k)) = -1; - - k1 = Base::U_row_map(rowid); - k2 = Base::U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) Base::iw(tid, Base::U_entries(k)) = -1; } }; @@ -534,6 +397,9 @@ struct IlukWrap { const auto my_team = team.league_rank(); const auto rowid = Base::level_idx(my_team + Base::lev_start); // map to rowid + + // Set active entries in L to zero, store active cols in iw + // Set L diagonal for this row to identity size_type k1 = Base::L_row_map(rowid); size_type k2 = Base::L_row_map(rowid + 1) - 1; Base::lset_id(team, k2); @@ -546,6 +412,7 @@ struct IlukWrap { team.team_barrier(); + // Set active entries in U to zero, store active cols in iw k1 = Base::U_row_map(rowid); k2 = Base::U_row_map(rowid + 1); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), @@ -557,7 +424,7 @@ struct IlukWrap { team.team_barrier(); - // Unpack the ith row of A + // Unpack the rowid-th row of A, copy into L,U k1 = Base::A_row_map(rowid); k2 = Base::A_row_map(rowid + 1); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), @@ -588,13 +455,9 @@ struct IlukWrap { const auto col = Base::U_entries(kk); const auto ipos = Base::iw(my_team, col); if (ipos != -1) { - scalar_t buff[Base::BUFF_SIZE]; - auto lxu = Base::multiply(-1.0, Base::uget(kk), fact, &buff[0]); - if (col < rowid) { - Base::add(Base::lget(ipos), lxu); - } else { - Base::add(Base::uget(ipos), lxu); - } + typename Base::reftype C = + col < rowid ? Base::lget(ipos) : Base::uget(ipos); + Base::multiply_subtract(fact, Base::uget(kk), C); } }); // end for kk @@ -657,14 +520,13 @@ struct IlukWrap { const URowMapType &U_row_map, const UEntriesType &U_entries, UValuesType &U_values) { - using RPF = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, false); - using RPB = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, true); using TPF = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, false); using TPB = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, true); - size_type nlevels = thandle.get_num_levels(); - int team_size = thandle.get_team_size(); - const auto block_size = thandle.get_block_size(); + size_type nlevels = thandle.get_num_levels(); + int team_size = thandle.get_team_size(); + const auto block_size = thandle.get_block_size(); + const auto block_enabled = thandle.is_block_enabled(); LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); LevelViewType level_idx = thandle.get_level_idx(); @@ -672,12 +534,9 @@ struct IlukWrap { LevelHostViewType level_nchunks_h, level_nrowsperchunk_h; WorkViewType iw; - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_nchunks_h = thandle.get_level_nchunks(); - level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); - } - iw = thandle.get_iw(); + level_nchunks_h = thandle.get_level_nchunks(); + level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); + iw = thandle.get_iw(); // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead @@ -686,39 +545,26 @@ struct IlukWrap { lno_t lev_end = level_ptr_h(lvl + 1); if ((lev_end - lev_start) != 0) { - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - range_policy rpolicy = get_range_policy(lev_start, lev_end); + lno_t lvl_rowid_start = 0; + lno_t lvl_nrows_chunk; + for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { + if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > + (lev_end - lev_start)) + lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + + team_policy tpolicy = get_team_policy(lvl_nrows_chunk, team_size); KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, - rpolicy, "parfor_fixed_lvl", level_idx, iw, - lev_start, RPF, RPB, thandle.is_block_enabled(), - block_size); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm:: - SEQLVLSCHD_TP1) { - lno_t lvl_rowid_start = 0; - lno_t lvl_nrows_chunk; - for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { - if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > - (lev_end - lev_start)) - lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; - else - lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - - team_policy tpolicy = get_team_policy(lvl_nrows_chunk, team_size); - KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, - U_values, tpolicy, "parfor_tp1", level_idx, iw, - lev_start + lvl_rowid_start, TPF, TPB, - thandle.is_block_enabled(), block_size); - Kokkos::fence(); - lvl_rowid_start += lvl_nrows_chunk; - } + tpolicy, "parfor_tp1", level_idx, iw, + lev_start + lvl_rowid_start, TPF, TPB, + block_enabled, block_size); + Kokkos::fence(); + lvl_rowid_start += lvl_nrows_chunk; } } // end if } // end for lvl - //} // Output check #ifdef NUMERIC_OUTPUT_INFO @@ -781,8 +627,6 @@ struct IlukWrap { const std::vector &U_row_map_v, const std::vector &U_entries_v, std::vector &U_values_v) { - using RPF = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, false); - using RPB = FunctorTypeMacro(ILUKLvlSchedRPNumericFunctor, true); using TPF = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, false); using TPB = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, true); @@ -811,111 +655,71 @@ struct IlukWrap { if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; } - // Assume all streams use the same algorithm - if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // Initial work across streams at each level - for (int i = 0; i < nstreams; i++) { - // Only do this if this stream has this level - if (lvl < nlevels_v[i]) { - lvl_start_v[i] = lvl_ptr_h_v[i](lvl); - lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); - if ((lvl_end_v[i] - lvl_start_v[i]) != 0) - stream_have_level_v[i] = true; - else - stream_have_level_v[i] = false; - } else - stream_have_level_v[i] = false; - } + std::vector lvl_nchunks_h_v(nstreams); + std::vector lvl_nrowsperchunk_h_v(nstreams); + std::vector lvl_rowid_start_v(nstreams); + std::vector team_size_v(nstreams); - // Main work of the level across streams - // 1. Launch work on all streams - for (int i = 0; i < nstreams; i++) { - // Launch only if stream i-th has this level - if (stream_have_level_v[i]) { - range_policy rpolicy = - get_range_policy(execspace_v[i], lvl_start_v[i], lvl_end_v[i]); - KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], - rpolicy, "parfor_rp", lvl_idx_v[i], iw_v[i], - lvl_start_v[i], RPF, RPB, is_block_enabled_v[i], - block_size_v[i]); - } // end if (stream_have_level_v[i]) - } // end for streams - } // end for lvl - } // end SEQLVLSCHD_RP - else if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - std::vector lvl_nchunks_h_v(nstreams); - std::vector lvl_nrowsperchunk_h_v(nstreams); - std::vector lvl_rowid_start_v(nstreams); - std::vector team_size_v(nstreams); + for (int i = 0; i < nstreams; i++) { + lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); + lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); + team_size_v[i] = thandle_v[i]->get_team_size(); + } + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + lno_t lvl_nchunks_max = 0; for (int i = 0; i < nstreams; i++) { - lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); - lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); - team_size_v[i] = thandle_v[i]->get_team_size(); - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // Initial work across streams at each level - lno_t lvl_nchunks_max = 0; - for (int i = 0; i < nstreams; i++) { - // Only do this if this stream has this level - if (lvl < nlevels_v[i]) { - lvl_start_v[i] = lvl_ptr_h_v[i](lvl); - lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); - if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { - stream_have_level_v[i] = true; - lvl_rowid_start_v[i] = 0; - if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) - lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); - } else - stream_have_level_v[i] = false; + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { + stream_have_level_v[i] = true; + lvl_rowid_start_v[i] = 0; + if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) + lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); } else stream_have_level_v[i] = false; - } + } else + stream_have_level_v[i] = false; + } - // Main work of the level across streams -- looping through chunnks - for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { - // 1. Launch work on all streams (for each chunk) - for (int i = 0; i < nstreams; i++) { - // Launch only if stream i-th has this level - if (stream_have_level_v[i]) { - // Launch only if stream i-th has this chunk - if (chunkid < lvl_nchunks_h_v[i](lvl)) { - // 1.a. Specify number of rows (i.e. number of teams) to launch - lno_t lvl_nrows_chunk = 0; - if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > - (lvl_end_v[i] - lvl_start_v[i])) - lvl_nrows_chunk = - (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; - else - lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); - - // 1.b. Create functor for stream i-th and launch - team_policy tpolicy = get_team_policy( - execspace_v[i], lvl_nrows_chunk, team_size_v[i]); - KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], - tpolicy, "parfor_tp1", lvl_idx_v[i], iw_v[i], - lvl_start_v[i] + lvl_rowid_start_v[i], TPF, - TPB, is_block_enabled_v[i], block_size_v[i]); - // 1.c. Ready to move to next chunk - lvl_rowid_start_v[i] += lvl_nrows_chunk; - } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) - } // end if (stream_have_level_v[i]) - } // end for streams - } // end for chunkid - } // end for lvl - } // end SEQLVLSCHD_TP1 - - } // end iluk_numeric_streams + // Main work of the level across streams -- looping through chunnks + for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { + // 1. Launch work on all streams (for each chunk) + for (int i = 0; i < nstreams; i++) { + // Launch only if stream i-th has this level + if (stream_have_level_v[i]) { + // Launch only if stream i-th has this chunk + if (chunkid < lvl_nchunks_h_v[i](lvl)) { + // 1.a. Specify number of rows (i.e. number of teams) to launch + lno_t lvl_nrows_chunk = 0; + if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > + (lvl_end_v[i] - lvl_start_v[i])) + lvl_nrows_chunk = + (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; + else + lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); + + // 1.b. Create functor for stream i-th and launch + team_policy tpolicy = get_team_policy( + execspace_v[i], lvl_nrows_chunk, team_size_v[i]); + KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + tpolicy, "parfor_tp1", lvl_idx_v[i], iw_v[i], + lvl_start_v[i] + lvl_rowid_start_v[i], TPF, TPB, + is_block_enabled_v[i], block_size_v[i]); + // 1.c. Ready to move to next chunk + lvl_rowid_start_v[i] += lvl_nrows_chunk; + } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) + } // end if (stream_have_level_v[i]) + } // end for streams + } // end for chunkid + } // end for lvl + } // end iluk_numeric_streams }; // IlukWrap diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 9e40c23af7..31c2494cdd 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -229,9 +229,7 @@ void iluk_symbolic(IlukHandle& thandle, LEntriesType& L_entries_d, URowMapType& U_row_map_d, UEntriesType& U_entries_d, int nstreams = 1) { if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP || - thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) /* || thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHED_TP2 )*/ { @@ -379,7 +377,7 @@ void iluk_symbolic(IlukHandle& thandle, std::ostringstream os; os << "KokkosSparse::Experimental::spiluk_symbolic: U_entries's extent " "must be larger than " - << U_entries_d.extent(0); + << U_entries_d.extent(0) << ", must be at least " << cntU + lenu + 1; KokkosKernels::Impl::throw_runtime_exception(os.str()); } // U diag entry @@ -401,7 +399,7 @@ void iluk_symbolic(IlukHandle& thandle, std::ostringstream os; os << "KokkosSparse::Experimental::spiluk_symbolic: L_entries's extent " "must be larger than " - << L_entries_d.extent(0); + << L_entries_d.extent(0) << ", must be at least " << cntL + lenl + 1; KokkosKernels::Impl::throw_runtime_exception(os.str()); } for (size_type k = 0; k < lenl; ++k) { diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index a257b8f09c..d687c8dd4f 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace KokkosSparse { namespace Experimental { @@ -45,8 +46,9 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using MEMSP = typename CRS::memory_space; + using DEVICE = typename Kokkos::Device; using karith = typename Kokkos::ArithTraits; - using View1d = typename Kokkos::View; + using View1d = typename Kokkos::View; private: // trsm takes host views @@ -61,11 +63,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { LUPrec(const CRSArg &L, const CRSArg &U) : _L(L), _U(U), - _tmp("LUPrec::_tmp", L.numRows()), - _tmp2("LUPrec::_tmp", L.numRows()), + _tmp("LUPrec::_tmp", L.numPointRows()), + _tmp2("LUPrec::_tmp", L.numPointRows()), _khL(), _khU() { - KK_REQUIRE_MSG(L.numRows() == U.numRows(), + KK_REQUIRE_MSG(L.numPointRows() == U.numPointRows(), "LUPrec: L.numRows() != U.numRows()"); _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), @@ -80,22 +82,13 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { _khU.destroy_sptrsv_handle(); } - ///// \brief Apply the preconditioner to X, putting the result in Y. - ///// - ///// \tparam XViewType Input vector, as a 1-D Kokkos::View - ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View - ///// - ///// \param transM [in] Not used. - ///// \param alpha [in] Not used - ///// \param beta [in] Not used. - ///// - ///// It takes L and U and the stores U^inv L^inv X in Y - // - virtual void apply( - const Kokkos::View> &X, - const Kokkos::View> &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + template < + typename Matrix, + typename std::enable_if::value>::type * = nullptr> + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp @@ -111,6 +104,62 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { KokkosBlas::axpby(alpha, _tmp2, beta, Y); } + + template < + typename Matrix, + typename std::enable_if::value>::type * = nullptr> + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + // tmp = trsv(L, x); //Apply L^inv to x + // y = trsv(U, tmp); //Apply U^inv to tmp + + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], + "LUPrec::apply only supports 'N' for transM"); + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + using Layout = Kokkos::LayoutLeft; +#else + using Layout = Kokkos::LayoutRight; +#endif + + // trsv is implemented for MV so we need to convert our views + using UView2d = typename Kokkos::View< + ScalarType **, Layout, DEVICE, + Kokkos::MemoryTraits >; + using UView2dc = typename Kokkos::View< + const ScalarType **, Layout, DEVICE, + Kokkos::MemoryTraits >; + UView2dc X2d(X.data(), X.extent(0), 1); + UView2d Y2d(Y.data(), Y.extent(0), 1), + tmp2d(_tmp.data(), _tmp.extent(0), 1), + tmp22d(_tmp2.data(), _tmp2.extent(0), 1); + + KokkosSparse::trsv("L", "N", "N", _L, X2d, tmp2d); + KokkosSparse::trsv("U", "N", "N", _U, tmp2d, tmp22d); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); + } + + ///// \brief Apply the preconditioner to X, putting the result in Y. + ///// + ///// \tparam XViewType Input vector, as a 1-D Kokkos::View + ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View + ///// + ///// \param transM [in] Not used. + ///// \param alpha [in] Not used + ///// \param beta [in] Not used. + ///// + ///// It takes L and U and the stores U^inv L^inv X in Y + // + virtual void apply(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", + ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + apply_impl(X, Y, transM, alpha, beta); + } //@} //! Set this preconditioner's parameters. diff --git a/sparse/src/KokkosSparse_spiluk_handle.hpp b/sparse/src/KokkosSparse_spiluk_handle.hpp index 2b37d08f6e..952a14aa2d 100644 --- a/sparse/src/KokkosSparse_spiluk_handle.hpp +++ b/sparse/src/KokkosSparse_spiluk_handle.hpp @@ -29,7 +29,6 @@ namespace Experimental { // TP2 algorithm has issues with some offset-ordinal combo to be addressed enum class SPILUKAlgorithm { - SEQLVLSCHD_RP, SEQLVLSCHD_TP1 /*, SEQLVLSCHED_TP2*/ }; @@ -256,9 +255,6 @@ class SPILUKHandle { int get_vector_size() const { return this->vector_size; } void print_algorithm() { - if (algm == SPILUKAlgorithm::SEQLVLSCHD_RP) - std::cout << "SEQLVLSCHD_RP" << std::endl; - if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; @@ -269,19 +265,6 @@ class SPILUKHandle { } */ } - - inline SPILUKAlgorithm StringToSPILUKAlgorithm(std::string &name) { - if (name == "SPILUK_DEFAULT") - return SPILUKAlgorithm::SEQLVLSCHD_RP; - else if (name == "SPILUK_RANGEPOLICY") - return SPILUKAlgorithm::SEQLVLSCHD_RP; - else if (name == "SPILUK_TEAMPOLICY1") - return SPILUKAlgorithm::SEQLVLSCHD_TP1; - /*else if(name=="SPILUK_TEAMPOLICY2") return - * SPILUKAlgorithm::SEQLVLSCHED_TP2;*/ - else - throw std::runtime_error("Invalid SPILUKAlgorithm name"); - } }; } // namespace Experimental diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 7d52d08ee6..08f41eefbb 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -28,10 +28,13 @@ #include "KokkosSparse_spiluk.hpp" #include "KokkosSparse_crs_to_bsr_impl.hpp" #include "KokkosSparse_bsr_to_crs_impl.hpp" +#include "KokkosSparse_LUPrec.hpp" +#include "KokkosSparse_gmres.hpp" #include "Test_vector_fixtures.hpp" #include +#include using namespace KokkosSparse; using namespace KokkosSparse::Experimental; @@ -41,10 +44,28 @@ using namespace KokkosKernels::Experimental; using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; +// Comment this out to do focussed debugging +#define TEST_SPILUK_FULL_CHECKS + +// Test verbosity level. 0 = none, 1 = print residuals, 2 = print L,U +#define TEST_SPILUK_VERBOSE_LEVEL 0 + +// #define TEST_SPILUK_TINY_TEST + namespace Test { +#ifdef TEST_SPILUK_TINY_TEST +template +std::vector> get_fixture() { + std::vector> A = {{10.00, 1.00, 0.00, 0.00}, + {0.00, 11.00, 0.00, 0.00}, + {0.00, 2.00, 12.00, 0.00}, + {5.00, 0.00, 3.00, 13.00}}; + return A; +} +#else template -std::vector> get_9x9_fixture() { +std::vector> get_fixture() { std::vector> A = { {10.00, 0.00, 0.30, 0.00, 0.00, 0.60, 0.00, 0.00, 0.00}, {0.00, 11.00, 0.00, 0.00, 0.00, 0.00, 0.70, 0.00, 0.00}, @@ -57,16 +78,49 @@ std::vector> get_9x9_fixture() { {0.00, 0.00, 0.00, 2.00, 2.50, 0.00, 0.00, 0.00, 18.00}}; return A; } +#endif + +template < + typename MatrixType, typename CRS, + typename std::enable_if::value>::type* = nullptr> +MatrixType get_A(CRS A_unblocked, const size_t) { + return A_unblocked; +} + +template < + typename MatrixType, typename CRS, + typename std::enable_if::value>::type* = nullptr> +MatrixType get_A(CRS A_unblocked, const size_t block_size) { + // Convert to BSR + MatrixType A(A_unblocked, block_size); -template -std::vector> get_4x4_fixture() { - std::vector> A = {{10.00, 1.00, 0.00, 0.00}, - {0.00, 11.00, 0.00, 0.00}, - {0.00, 2.00, 12.00, 0.00}, - {5.00, 0.00, 0.00, 13.00}}; return A; } +template < + typename MatrixType, typename RowMapType, typename EntriesType, + typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const size_t) { + const auto nrows = row_map.extent(0) - 1; + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, + entries); +} + +template < + typename MatrixType, typename RowMapType, typename EntriesType, + typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const size_t block_size) { + const auto nrows = row_map.extent(0) - 1; + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, + entries, block_size); +} + static constexpr double EPS = 1e-7; template ; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, @@ -119,6 +174,31 @@ struct SpilukTest { return diff_nrm / bb_nrm; } + static bool is_triangular(const RowMapType& drow_map, + const EntriesType& dentries, bool check_lower) { + const size_type nrows = drow_map.extent(0) - 1; + + auto row_map = Kokkos::create_mirror_view(drow_map); + auto entries = Kokkos::create_mirror_view(dentries); + Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(entries, dentries); + + for (size_type row = 0; row < nrows; ++row) { + const size_type row_nnz_begin = row_map(row); + const size_type row_nnz_end = row_map(row + 1); + for (size_type nnz = row_nnz_begin; nnz < row_nnz_end; ++nnz) { + const size_type col = entries(nnz); + if (col > row && check_lower) { + return false; + } else if (col < row && !check_lower) { + return false; + } + } + } + return true; + } + + template static void check_result(const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, const RowMapType& L_row_map, @@ -126,103 +206,70 @@ struct SpilukTest { const ValuesType& L_values, const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values) { - // Checking - const auto nrows = row_map.extent(0) - 1; - Crs A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries); - Crs L("L_Mtx", nrows, nrows, L_values.extent(0), L_values, L_row_map, - L_entries); - Crs U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, - U_entries); - - const auto result = check_result_impl(A, L, U, nrows); + const ValuesType& U_values, const lno_t fill_lev, + const size_type block_size = 1) { + using sp_matrix_type = std::conditional_t; - EXPECT_LT(result, 1e-4); - } + KK_REQUIRE(UseBlocks || (block_size == 1)); - static void check_result_block( - const RowMapType& row_map, const EntriesType& entries, - const ValuesType& values, const RowMapType& L_row_map, - const EntriesType& L_entries, const ValuesType& L_values, - const RowMapType& U_row_map, const EntriesType& U_entries, - const ValuesType& U_values, const size_type block_size) { // Checking const auto nrows = row_map.extent(0) - 1; - Bsr A("A_Mtx", nrows, nrows, values.extent(0), values, row_map, entries, - block_size); - Bsr L("L_Mtx", nrows, nrows, L_values.extent(0), L_values, L_row_map, - L_entries, block_size); - Bsr U("U_Mtx", nrows, nrows, U_values.extent(0), U_values, U_row_map, - U_entries, block_size); + auto A = make_matrix("A_Mtx", row_map, entries, values, + block_size); + auto L = make_matrix("L_Mtx", L_row_map, L_entries, + L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, + U_values, block_size); + + EXPECT_TRUE(is_triangular(L_row_map, L_entries, true)); + EXPECT_TRUE(is_triangular(U_row_map, U_entries, false)); const auto result = check_result_impl(A, L, U, nrows, block_size); - EXPECT_LT(result, 1e0); + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev; + if (UseBlocks) { + std::cout << ", block_size=" << block_size; + } else { + std::cout << ", unblocked"; + } + std::cout << " had residual: " << result << std::endl; + } + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "L result" << std::endl; + print_matrix( + decompress_matrix(L_row_map, L_entries, L_values, block_size)); + std::cout << "U result" << std::endl; + print_matrix( + decompress_matrix(U_row_map, U_entries, U_values, block_size)); + } + + if (fill_lev > 1) { + if (UseBlocks) { + EXPECT_LT(result, 1e-2); + } else { + EXPECT_LT(result, 1e-4); + } + } } + template static std::tuple run_and_check_spiluk(KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, const ValuesType& values, - SPILUKAlgorithm alg, const lno_t fill_lev) { - const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 4 * nrows, 4 * nrows); - - auto spiluk_handle = kh.get_spiluk_handle(); + SPILUKAlgorithm alg, const lno_t fill_lev, + const size_type block_size = 1) { + KK_REQUIRE(UseBlocks || (block_size == 1)); - // Allocate L and U as outputs - RowMapType L_row_map("L_row_map", nrows + 1); - EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - RowMapType U_row_map("U_row_map", nrows + 1); - EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - - spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, - U_row_map, U_entries); - - Kokkos::fence(); - - Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - - spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, - L_entries, L_values, U_row_map, U_entries, U_values); - - Kokkos::fence(); - - check_result(row_map, entries, values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values); - - kh.destroy_spiluk_handle(); - - // For team policy alg, check results against range policy - if (alg == SPILUKAlgorithm::SEQLVLSCHD_TP1) { - const auto [L_row_map_rp, L_entries_rp, L_values_rp, U_row_map_rp, - U_entries_rp, U_values_rp] = - run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev); - - EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(L_values, L_values_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_rp, EPS); - EXPECT_NEAR_KK_1DVIEW(U_values, U_values_rp, EPS); - } - - return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, - U_values); - } - - static void run_and_check_spiluk_block( - KernelHandle& kh, const RowMapType& row_map, const EntriesType& entries, - const ValuesType& values, SPILUKAlgorithm alg, const lno_t fill_lev, - const size_type block_size) { const size_type block_items = block_size * block_size; const size_type nrows = row_map.extent(0) - 1; - kh.create_spiluk_handle(alg, nrows, 4 * nrows, 4 * nrows, block_size); + kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, + !UseBlocks ? 0 : block_size); auto spiluk_handle = kh.get_spiluk_handle(); + if (TeamSize != -1) { + spiluk_handle->set_team_size(TeamSize); + } // Allocate L and U as outputs RowMapType L_row_map("L_row_map", nrows + 1); @@ -245,16 +292,19 @@ struct SpilukTest { Kokkos::fence(); - check_result_block(row_map, entries, values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, block_size); + check_result(row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, fill_lev, + block_size); kh.destroy_spiluk_handle(); +#ifdef TEST_SPILUK_FULL_CHECKS // If block_size is 1, results should exactly match unblocked results - if (block_size == 1) { + if (block_size == 1 && UseBlocks) { const auto [L_row_map_u, L_entries_u, L_values_u, U_row_map_u, U_entries_u, U_values_u] = - run_and_check_spiluk(kh, row_map, entries, values, alg, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, + alg, fill_lev); EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_u, EPS); EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_u, EPS); @@ -263,10 +313,34 @@ struct SpilukTest { EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_u, EPS); EXPECT_NEAR_KK_1DVIEW(U_values, U_values_u, EPS); } + + // Check that team size = 1 produces same result + if (TeamSize != 1) { + const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, + U_entries_ts1, U_values_ts1] = + run_and_check_spiluk(kh, row_map, entries, values, alg, + fill_lev, block_size); + + EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_values, L_values_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_values, U_values_ts1, EPS); + } +#endif + + return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values); } static void run_test_spiluk() { - std::vector> A = get_9x9_fixture(); + std::vector> A = get_fixture(); + + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "A input" << std::endl; + print_matrix(A); + } RowMapType row_map; EntriesType entries; @@ -278,10 +352,122 @@ struct SpilukTest { KernelHandle kh; - run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev); - run_and_check_spiluk(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + } + + static void run_test_spiluk_blocks() { + std::vector> A = get_fixture(); + + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "A input" << std::endl; + print_matrix(A); + } + + RowMapType row_map, brow_map; + EntriesType entries, bentries; + ValuesType values, bvalues; + + compress_matrix(row_map, entries, values, A); + + const size_type nrows = A.size(); + const size_type nnz = values.extent(0); + const lno_t fill_lev = 2; + const size_type block_size = nrows % 2 == 0 ? 2 : 3; + ASSERT_EQ(nrows % block_size, 0); + + KernelHandle kh; + + Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, + entries); + + std::vector block_sizes = {1, block_size}; + + for (auto block_size_itr : block_sizes) { + Bsr bsr(crs, block_size_itr); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size_itr); + } + } + + static void run_test_spiluk_scale() { + // Create a diagonally dominant sparse matrix to test: + constexpr auto nrows = 5000; + constexpr auto diagDominance = 2; + + size_type nnz = 10 * nrows; + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + KokkosSparse::sort_crs_matrix(A); + + // Pull out views from CRS + RowMapType row_map("row_map", A.graph.row_map.extent(0)); + EntriesType entries("entries", A.graph.entries.extent(0)); + ValuesType values("values", A.values.extent(0)); + Kokkos::deep_copy(row_map, A.graph.row_map); + Kokkos::deep_copy(entries, A.graph.entries); + Kokkos::deep_copy(values, A.values); + + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { + KernelHandle kh; + + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + } + } + + static void run_test_spiluk_scale_blocks() { + // Create a diagonally dominant sparse matrix to test: + constexpr auto nrows = 5000; + constexpr auto diagDominance = 2; + + RowMapType brow_map; + EntriesType bentries; + ValuesType bvalues; + + // const size_type block_size = 10; + + size_type nnz = 10 * nrows; + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + KokkosSparse::sort_crs_matrix(A); + + std::vector block_sizes = {1, 2, 4, 10}; + + for (auto block_size : block_sizes) { + // Convert to BSR + Bsr bsr(A, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { + KernelHandle kh; + + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); + } + } } static void run_test_spiluk_streams(SPILUKAlgorithm test_algo, int nstreams) { @@ -316,7 +502,7 @@ struct SpilukTest { std::vector U_entries_v(nstreams); std::vector U_values_v(nstreams); - std::vector> Afix = get_9x9_fixture(); + std::vector> Afix = get_fixture(); RowMapType row_map; EntriesType entries; @@ -383,9 +569,10 @@ struct SpilukTest { // Checking for (int i = 0; i < nstreams; i++) { - check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i]); + check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + fill_lev); kh_v[i].destroy_spiluk_handle(); } @@ -424,7 +611,7 @@ struct SpilukTest { std::vector U_entries_v(nstreams); std::vector U_values_v(nstreams); - std::vector> Afix = get_9x9_fixture(); + std::vector> Afix = get_fixture(); RowMapType row_map, brow_map; EntriesType entries, bentries; @@ -512,57 +699,158 @@ struct SpilukTest { // Checking for (int i = 0; i < nstreams; i++) { - check_result_block(A_row_map_v[i], A_entries_v[i], A_values_v[i], + check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], - block_size); + fill_lev, block_size); kh_v[i].destroy_spiluk_handle(); } } - static void run_test_spiluk_blocks() { - std::vector> A = get_9x9_fixture(); - - RowMapType row_map, brow_map; - EntriesType entries, bentries; - ValuesType values, bvalues; - - compress_matrix(row_map, entries, values, A); - - const size_type nrows = A.size(); - const size_type nnz = values.extent(0); - const lno_t fill_lev = 2; - const size_type block_size = nrows % 2 == 0 ? 2 : 3; - ASSERT_EQ(nrows % block_size, 0); - - KernelHandle kh; - - // Check block_size=1 produces identical result to unblocked - run_and_check_spiluk_block(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev, 1); - run_and_check_spiluk_block(kh, row_map, entries, values, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, 1); - - // Convert to BSR - Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, - entries); - Bsr bsr(crs, block_size); - - // Pull out views from BSR - Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); - Kokkos::resize(bentries, bsr.graph.entries.extent(0)); - Kokkos::resize(bvalues, bsr.values.extent(0)); - Kokkos::deep_copy(brow_map, bsr.graph.row_map); - Kokkos::deep_copy(bentries, bsr.graph.entries); - Kokkos::deep_copy(bvalues, bsr.values); + template + static void run_test_spiluk_precond() { + // Test using spiluk as a preconditioner + // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? + + // Create a diagonally dominant sparse matrix to test: + using sp_matrix_type = std::conditional_t; + + constexpr auto nrows = 5000; + constexpr auto m = 15; + constexpr auto diagDominance = 2; + constexpr auto tol = 1e-5; + constexpr bool verbose = false; + + if (UseBlocks) { + // Skip test if not on host. block trsv only works on host + static constexpr bool is_host = + std::is_same::value; + if (!is_host) { + return; + } + } - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_RP, fill_lev, - block_size); - run_and_check_spiluk_block(kh, brow_map, bentries, bvalues, - SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, - block_size); + RowMapType brow_map; + EntriesType bentries; + ValuesType bvalues; + + size_type nnz = 10 * nrows; + auto A_unblocked = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + KokkosSparse::sort_crs_matrix(A_unblocked); + + std::vector block_sizes_blocked = {1, 2, 4, 10}; + std::vector block_sizes_unblocked = {1}; + std::vector block_sizes = + UseBlocks ? block_sizes_blocked : block_sizes_unblocked; + + for (auto block_size : block_sizes) { + // Convert to BSR if block enabled + auto A = get_A(A_unblocked, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, A.graph.row_map.extent(0)); + Kokkos::resize(bentries, A.graph.entries.extent(0)); + Kokkos::resize(bvalues, A.values.extent(0)); + Kokkos::deep_copy(brow_map, A.graph.row_map); + Kokkos::deep_copy(bentries, A.graph.entries); + Kokkos::deep_copy(bvalues, A.values); + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + gmres_handle->set_verbose(verbose); + using GMRESHandle = + typename std::remove_reference::type; + + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { + const auto [L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values] = + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, + fill_lev, block_size); + + // Create L, U + auto L = make_matrix("L_Mtx", L_row_map, L_entries, + L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, + U_values, block_size); + + // Set initial vectors: + ValuesType X("X", nrows); // Solution and initial guess + ValuesType Wj("Wj", nrows); // For checking residuals at end. + ValuesType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + nrows); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + int num_iters_plain(0), num_iters_precond(0); + + // Solve Ax = b + { + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_plain = gmres_handle->get_num_iters(); + + EXPECT_GT(num_iters_plain, 0); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "Without LUPrec, with block_size=" << block_size + << ", converged in " << num_iters_plain + << " steps with endres=" << endRes << std::endl; + } + } + + // Solve Ax = b with LU preconditioner. + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); + + // Make precond. + KokkosSparse::Experimental::LUPrec + myPrec(L, U); + + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); + + gmres(&kh, A, B, X, &myPrec); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_precond = gmres_handle->get_num_iters(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + EXPECT_LT(num_iters_precond, num_iters_plain); + + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "With LUPrec, with block_size=" << block_size + << ", and fill_level=" << fill_lev << ", converged in " + << num_iters_precond << " steps with endres=" << endRes + << std::endl; + } + } + } + } } }; @@ -574,6 +862,10 @@ void test_spiluk() { using TestStruct = Test::SpilukTest; TestStruct::run_test_spiluk(); TestStruct::run_test_spiluk_blocks(); + TestStruct::run_test_spiluk_scale(); + TestStruct::run_test_spiluk_scale_blocks(); + TestStruct::template run_test_spiluk_precond(); + TestStruct::template run_test_spiluk_precond(); } template ; - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 1); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 2); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 3); - TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_RP, 4); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 1); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 2); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 3); - TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_RP, 4); TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, diff --git a/sparse/unit_test/Test_vector_fixtures.hpp b/sparse/unit_test/Test_vector_fixtures.hpp index 21b155970d..b17197a3e5 100644 --- a/sparse/unit_test/Test_vector_fixtures.hpp +++ b/sparse/unit_test/Test_vector_fixtures.hpp @@ -117,7 +117,8 @@ decompress_matrix(const RowMapT& row_map, const EntriesT& entries, template std::vector> decompress_matrix(const RowMapT& row_map, const EntriesT& entries, - const ValuesT& values, const int block_size) { + const ValuesT& values, + typename RowMapT::const_value_type block_size) { using size_type = typename RowMapT::non_const_value_type; using scalar_t = typename ValuesT::non_const_value_type; @@ -183,7 +184,7 @@ template void print_matrix(const std::vector>& matrix) { for (const auto& row : matrix) { for (const auto& item : row) { - std::printf("%.2f ", item); + std::printf("%.5f ", item); } std::cout << std::endl; } From d3ce803c2dfc8c9c3f480bc3dc6d52ffc77ef7f1 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 22 Feb 2024 13:35:16 -0700 Subject: [PATCH 171/326] github workflows: update to v4 (use Node 20) --- .github/workflows/docs.yml | 4 ++-- .github/workflows/format.yml | 2 +- .github/workflows/osx.yml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 558b6bd96d..04a1ba74b2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,12 +23,12 @@ jobs: doxygen --version - name: checkout_kokkos_kernels - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: kokkos/kokkos ref: develop diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 220461fe62..6e2db4031a 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install Dependencies run: sudo apt install clang-format-8 diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 8d9f7123f8..a2b2d8c830 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: kokkos/kokkos ref: develop From f53f7c50d123fafa041b2a3118d108518dc0c8ab Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 22 Feb 2024 13:42:44 -0700 Subject: [PATCH 172/326] Refactor Test_Sparse_sptrsv (#2102) * Refactor Test_Sparse_sptrsv * More cleanups * Remove old commented-out code --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 13 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 3 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 1916 ++++++----------- sparse/unit_test/Test_vector_fixtures.hpp | 49 +- 4 files changed, 738 insertions(+), 1243 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 89370f0dc5..48c7d41a91 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -752,17 +752,8 @@ void test_gauss_seidel_streams_rank1( } #endif // KOKKOS_ENABLE_OPENMP - std::vector instances; - if (nstreams == 1) - instances = Kokkos::Experimental::partition_space(execution_space(), 1); - else if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + auto instances = Kokkos::Experimental::partition_space( + execution_space(), std::vector(nstreams, 1)); std::vector kh_v(nstreams); std::vector input_mat_v(nstreams); diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 08f41eefbb..2a8398ed46 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -139,8 +139,7 @@ struct SpilukTest { using range_policy = Kokkos::RangePolicy; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space>; + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using Crs = CrsMatrix; using Bsr = BsrMatrix; diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 1a4c78e08e..b8b35bc422 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -38,1320 +38,808 @@ using namespace KokkosKernels; using namespace KokkosKernels::Impl; using namespace KokkosKernels::Experimental; -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #endif -// #ifndef kokkos_complex_float -// #define kokkos_complex_float Kokkos::complex -// #endif - -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; namespace Test { -#if 0 -template -void run_test_sptrsv_mtx() { - - typedef typename KokkosSparse::CrsMatrix crsmat_t; - typedef typename crsmat_t::StaticCrsGraphType graph_t; - - //typedef Kokkos::View< size_type*, device > RowMapType; - //typedef Kokkos::View< lno_t*, device > EntriesType; - typedef Kokkos::View< scalar_t*, device > ValuesType; - - // Lower tri - std::cout << "LowerTriTest Begin" << std::endl; - { - -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-offshore-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Transport-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Fault_639amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-thermal2-amd.mtx"; - std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-dielFilterV2real-amd.mtx"; - std::cout << "Matrix file: " << mtx_filename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix - graph_t lgraph = triMtx.graph; // in_graph - - auto row_map = lgraph.row_map; - auto entries = lgraph.entries; - auto values = triMtx.values; - - const size_type nrows = lgraph.numRows(); -// const size_type nnz = triMtx.nnz(); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - - std::cout << "UnitTest nrows = " << nrows << std::endl; - - KernelHandle kh; - bool is_lower_tri = true; - std::cout << "Create handle" << std::endl; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - - std::cout << "Prepare linear system" << std::endl; - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - -// typedef CrsMatrix crsMat_t; -// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - - std::cout << "SPMV" << std::endl; - KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - - std::cout << "TriSolve Symbolic" << std::endl; - Kokkos::Timer timer; - sptrsv_symbolic( &kh, row_map, entries ); - std::cout << "LTRI Symbolic Time: " << timer.seconds() << std::endl; +template +struct SptrsvTest { + // Define useful types + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - std::cout << "TriSolve Solve" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve TEAMPOLICY! Time: " << timer.seconds() << std::endl; + using Crs = CrsMatrix; + using Bsr = BsrMatrix; - scalar_t sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + using crs_graph_t = typename Crs::StaticCrsGraphType; + using range_policy_t = Kokkos::RangePolicy; - kh.destroy_sptrsv_handle(); + static std::vector> get_5x5_ut_ones_fixture() { + std::vector> A = {{1.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 1.00, 0.00, 0.00, 1.00}, + {0.00, 0.00, 1.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 1.00}}; + return A; } - // Upper tri - std::cout << "UpperTriTest Begin" << std::endl; - { -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-offshore-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Transport-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Fault_639amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-thermal2-amd.mtx"; - std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-dielFilterV2real-amd.mtx"; - std::cout << "Matrix file: " << mtx_filename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix - graph_t lgraph = triMtx.graph; // in_graph - - auto row_map = lgraph.row_map; - auto entries = lgraph.entries; - auto values = triMtx.values; - - const size_type nrows = lgraph.numRows(); -// const size_type nnz = triMtx.nnz(); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - - std::cout << "UnitTest nrows = " << nrows << std::endl; - - KernelHandle kh; - bool is_lower_tri = false; - std::cout << "Create handle" << std::endl; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - - std::cout << "Prepare linear system" << std::endl; - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - -// typedef CrsMatrix crsMat_t; -// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - std::cout << "SPMV" << std::endl; - KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - - std::cout << "TriSolve Symbolic" << std::endl; - Kokkos::Timer timer; - sptrsv_symbolic( &kh, row_map, entries ); - std::cout << "UTRI Symbolic Time: " << timer.seconds() << std::endl; - - std::cout << "TriSolve Solve" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHD_TP1 Time: " << timer.seconds() << std::endl; - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - kh.destroy_sptrsv_handle(); + static std::vector> get_5x5_ut_fixture() { + const auto KZ = KEEP_ZERO(); + std::vector> A = {{5.00, 1.00, 1.00, 0.00, KZ}, + {KZ, 5.00, KZ, 0.00, 1.00}, + {0.00, 0.00, 5.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 5.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 5.00}}; + return A; } -} -#endif - -namespace { -template -struct ReductionCheck { - using lno_t = OrdinalType; - using value_type = ValueType; - - ViewType lhs; + static std::vector> get_5x5_lt_fixture() { + const auto KZ = KEEP_ZERO(); + std::vector> A = {{5.00, KZ, 0.00, 0.00, 0.00}, + {2.00, 5.00, 0.00, 0.00, 0.00}, + {1.00, KZ, 5.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 5.00, 0.00}, + {KZ, 1.00, 1.00, 1.00, 5.00}}; + return A; + } - ReductionCheck(const ViewType &lhs_) : lhs(lhs_) {} + static std::vector> get_5x5_lt_ones_fixture() { + std::vector> A = {{1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 1.00, 0.00, 0.00, 0.00}, + {1.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 1.00, 0.00}, + {0.00, 1.00, 1.00, 1.00, 1.00}}; + return A; + } - KOKKOS_INLINE_FUNCTION - void operator()(lno_t i, value_type &tsum) const { tsum += lhs(i); } -}; -} // namespace + struct ReductionCheck { + ValuesType lhs; -template -void run_test_sptrsv() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; + ReductionCheck(const ValuesType &lhs_) : lhs(lhs_) {} - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); + KOKKOS_INLINE_FUNCTION + void operator()(lno_t i, scalar_t &tsum) const { tsum += lhs(i); } + }; - const size_type nrows = 5; - const size_type nnz = 10; + static void run_test_sptrsv() { + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space>; + const size_type nrows = 5; + const size_type nnz = 10; #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using host_crsmat_t = typename KernelHandle::SPTRSVHandleType::host_crsmat_t; - using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; + using host_crsmat_t = + typename KernelHandle::SPTRSVHandleType::host_crsmat_t; + using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; - using row_map_view_t = typename host_graph_t::row_map_type::non_const_type; - using cols_view_t = typename host_graph_t::entries_type::non_const_type; - using values_view_t = typename host_crsmat_t::values_type::non_const_type; + using row_map_view_t = typename host_graph_t::row_map_type::non_const_type; + using cols_view_t = typename host_graph_t::entries_type::non_const_type; + using values_view_t = typename host_crsmat_t::values_type::non_const_type; - // L & U handle for supernodal SpTrsv - KernelHandle khL; - KernelHandle khU; + // L & U handle for supernodal SpTrsv + KernelHandle khL; + KernelHandle khU; - // right-hand-side and solution - ValuesType B("rhs", nrows); - ValuesType X("sol", nrows); + // right-hand-side and solution + ValuesType B("rhs", nrows); + ValuesType X("sol", nrows); - // host CRS for L & U - host_crsmat_t L, U, Ut; + // host CRS for L & U + host_crsmat_t L, U, Ut; #endif - // Upper tri - { - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - hrow_map(0) = 0; - hrow_map(1) = 2; - hrow_map(2) = 4; - hrow_map(3) = 7; - hrow_map(4) = 9; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 1; - hentries(3) = 4; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 4; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } - - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); + // Upper tri + { + RowMapType row_map; + EntriesType entries; + ValuesType values; - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + auto fixture = get_5x5_ut_ones_fixture(); - // Solution to find - ValuesType lhs("lhs", nrows); + compress_matrix(row_map, entries, values, fixture); - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); - typedef CrsMatrix crsMat_t; - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + // Solution to find + ValuesType lhs("lhs", nrows); - { - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << - "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - */ + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); - kh.destroy_sptrsv_handle(); - } + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + { + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + // FIXME Issues with various integral type combos - algorithm currently + // unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0) ); + */ + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - kh.destroy_sptrsv_handle(); - } + { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, + is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); + } #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + if (std::is_same::value && + std::is_same::value && + std::is_same::value) { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } #endif #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - const scalar_t FIVE = scalar_t(5); - const size_type nnz_sp = 14; - { - // U in csr - row_map_view_t hUrowptr("hUrowptr", nrows + 1); - cols_view_t hUcolind("hUcolind", nnz_sp); - values_view_t hUvalues("hUvalues", nnz_sp); - - // rowptr - hUrowptr(0) = 0; - hUrowptr(1) = 4; - hUrowptr(2) = 8; - hUrowptr(3) = 11; - hUrowptr(4) = 13; - hUrowptr(5) = 14; - - // colind - // first row (first supernode) - hUcolind(0) = 0; - hUcolind(1) = 1; - hUcolind(2) = 2; - hUcolind(3) = 4; - // second row (first supernode) - hUcolind(4) = 0; - hUcolind(5) = 1; - hUcolind(6) = 2; - hUcolind(7) = 4; - // third row (second supernode) - hUcolind(8) = 2; - hUcolind(9) = 3; - hUcolind(10) = 4; - // fourth row (third supernode) - hUcolind(11) = 3; - hUcolind(12) = 4; - // fifth row (fourth supernode) - hUcolind(13) = 4; - - // values - // first row (first supernode) - hUvalues(0) = FIVE; - hUvalues(1) = ONE; - hUvalues(2) = ONE; - hUvalues(3) = ZERO; - // second row (first supernode) - hUvalues(4) = ZERO; - hUvalues(5) = FIVE; - hUvalues(6) = ZERO; - hUvalues(7) = ONE; - // third row (second supernode) - hUvalues(8) = FIVE; - hUvalues(9) = ONE; - hUvalues(10) = ONE; - // fourth row (third supernode) - hUvalues(11) = FIVE; - hUvalues(12) = ONE; - // fifth row (fourth supernode) - hUvalues(13) = FIVE; - - // save U for Supernodal Sptrsv - host_graph_t static_graph(hUcolind, hUrowptr); - U = host_crsmat_t("CrsMatrixU", nrows, hUvalues, static_graph); - - // create handle for Supernodal Sptrsv - bool is_lower_tri = false; - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - is_lower_tri); - - // X = U*ONES to generate B = A*ONES (on device) + const scalar_t FIVE = scalar_t(5); + const size_type nnz_sp = 14; { - RowMapType Urowptr("Urowptr", nrows + 1); - EntriesType Ucolind("Ucolind", nnz_sp); - ValuesType Uvalues("Uvalues", nnz_sp); - - Kokkos::deep_copy(Urowptr, hUrowptr); - Kokkos::deep_copy(Ucolind, hUcolind); - Kokkos::deep_copy(Uvalues, hUvalues); + // U in csr + auto ut_fixture = get_5x5_ut_fixture(); + row_map_view_t hUrowptr; + cols_view_t hUcolind; + values_view_t hUvalues; + + // first row -> first supernode + // second row -> first supernode + // third row -> second supernode + // fourth row -> third supernode + // fifth row -> fourth supernode + + compress_matrix(hUrowptr, hUcolind, hUvalues, ut_fixture); + + // save U for Supernodal Sptrsv + host_graph_t static_graph(hUcolind, hUrowptr); + U = host_crsmat_t("CrsMatrixU", nrows, hUvalues, static_graph); + + // create handle for Supernodal Sptrsv + bool is_lower_tri = false; + khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + is_lower_tri); + + // X = U*ONES to generate B = A*ONES (on device) + { + RowMapType Urowptr("Urowptr", nrows + 1); + EntriesType Ucolind("Ucolind", nnz_sp); + ValuesType Uvalues("Uvalues", nnz_sp); + + Kokkos::deep_copy(Urowptr, hUrowptr); + Kokkos::deep_copy(Ucolind, hUcolind); + Kokkos::deep_copy(Uvalues, hUvalues); + + Crs mtxU("mtxU", nrows, nrows, nnz_sp, Uvalues, Urowptr, Ucolind); + Kokkos::deep_copy(B, ONE); + KokkosSparse::spmv("N", ONE, mtxU, B, ZERO, X); + } + } - crsMat_t mtxU("mtxU", nrows, nrows, nnz_sp, Uvalues, Urowptr, Ucolind); - Kokkos::deep_copy(B, ONE); - KokkosSparse::spmv("N", ONE, mtxU, B, ZERO, X); + { + // U in csc (for inverting off-diag) + row_map_view_t hUcolptr("hUcolptr", nrows + 1); + cols_view_t hUrowind("hUrowind", nnz_sp); + values_view_t hUvalues("hUvalues", nnz_sp); + + // The unsorted ordering seems to matter here, so we cannot use our + // fixture tools. + + hUcolptr(0) = 0; + hUcolptr(1) = 2; + hUcolptr(2) = 4; + hUcolptr(3) = 7; + hUcolptr(4) = 9; + hUcolptr(5) = 14; + + // colind + // first column (first supernode) + hUrowind(0) = 0; + hUrowind(1) = 1; + // second column (first supernode) + hUrowind(2) = 0; + hUrowind(3) = 1; + // third column (second supernode) + hUrowind(4) = 2; + hUrowind(5) = 0; + hUrowind(6) = 1; + // fourth column (third supernode) + hUrowind(7) = 3; + hUrowind(8) = 2; + // fifth column (fourth supernode) + hUrowind(9) = 4; + hUrowind(10) = 0; + hUrowind(11) = 1; + hUrowind(12) = 2; + hUrowind(13) = 3; + + // values + // first column (first supernode) + hUvalues(0) = FIVE; + hUvalues(1) = ZERO; + // second column (first supernode) + hUvalues(2) = ONE; + hUvalues(3) = FIVE; + // third column (second supernode) + hUvalues(4) = FIVE; + hUvalues(5) = ONE; + hUvalues(6) = ZERO; + // fourth column (third supernode) + hUvalues(7) = FIVE; + hUvalues(8) = ONE; + // fifth column (fourth supernode) + hUvalues(9) = FIVE; + hUvalues(10) = ZERO; + hUvalues(11) = ONE; + hUvalues(12) = ONE; + hUvalues(13) = ONE; + + // store Ut in crsmat + host_graph_t static_graph(hUrowind, hUcolptr); + Ut = host_crsmat_t("CrsMatrixUt", nrows, hUvalues, static_graph); } +#endif } + // Lower tri { - // U in csc (for inverting off-diag) - row_map_view_t hUcolptr("hUcolptr", nrows + 1); - cols_view_t hUrowind("hUrowind", nnz_sp); - values_view_t hUvalues("hUvalues", nnz_sp); - - // colptr - hUcolptr(0) = 0; - hUcolptr(1) = 2; - hUcolptr(2) = 4; - hUcolptr(3) = 7; - hUcolptr(4) = 9; - hUcolptr(5) = 14; - - // colind - // first column (first supernode) - hUrowind(0) = 0; - hUrowind(1) = 1; - // second column (first supernode) - hUrowind(2) = 0; - hUrowind(3) = 1; - // third column (second supernode) - hUrowind(4) = 2; - hUrowind(5) = 0; - hUrowind(6) = 1; - // fourth column (third supernode) - hUrowind(7) = 3; - hUrowind(8) = 2; - // fifth column (fourth supernode) - hUrowind(9) = 4; - hUrowind(10) = 0; - hUrowind(11) = 1; - hUrowind(12) = 2; - hUrowind(13) = 3; - - // values - // first column (first supernode) - hUvalues(0) = FIVE; - hUvalues(1) = ZERO; - // second column (first supernode) - hUvalues(2) = ONE; - hUvalues(3) = FIVE; - // third column (second supernode) - hUvalues(4) = FIVE; - hUvalues(5) = ONE; - hUvalues(6) = ZERO; - // fourth column (third supernode) - hUvalues(7) = FIVE; - hUvalues(8) = ONE; - // fifth column (fourth supernode) - hUvalues(9) = FIVE; - hUvalues(10) = ZERO; - hUvalues(11) = ONE; - hUvalues(12) = ONE; - hUvalues(13) = ONE; - - // store Ut in crsmat - host_graph_t static_graph(hUrowind, hUcolptr); - Ut = host_crsmat_t("CrsMatrixUt", nrows, hUvalues, static_graph); - } -#endif - } - - // Lower tri - { - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - hrow_map(0) = 0; - hrow_map(1) = 1; - hrow_map(2) = 2; - hrow_map(3) = 4; - hrow_map(4) = 6; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 1; - hentries(2) = 0; - hentries(3) = 2; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 1; - hentries(7) = 2; - hentries(8) = 3; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } + auto fixture = get_5x5_lt_ones_fixture(); + RowMapType row_map; + EntriesType entries; + ValuesType values; - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); + compress_matrix(row_map, entries, values, fixture); - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); - // Solution to find - ValuesType lhs("lhs", nrows); + // Solution to find + ValuesType lhs("lhs", nrows); - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); - typedef CrsMatrix crsMat_t; - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - { - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << - "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + { + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + // FIXME Issues with various integral type combos - algorithm currently + // unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ( sum, lhs.extent(0) ); + */ + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - */ - kh.destroy_sptrsv_handle(); - } - - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, + is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + if (std::is_same::value && + std::is_same::value && + std::is_same::value) { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } #endif #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - { - // L in csc - const scalar_t TWO = scalar_t(2); - const scalar_t FIVE = scalar_t(5); - const size_type nnz_sp = 14; - - row_map_view_t hLcolptr("hUcolptr", nrows + 1); - cols_view_t hLrowind("hUrowind", nnz_sp); - values_view_t hLvalues("hUvalues", nnz_sp); - - // colptr - hLcolptr(0) = 0; - hLcolptr(1) = 4; - hLcolptr(2) = 8; - hLcolptr(3) = 11; - hLcolptr(4) = 13; - hLcolptr(5) = 14; - - // rowind - // first column (first supernode) - hLrowind(0) = 0; - hLrowind(1) = 1; - hLrowind(2) = 2; - hLrowind(3) = 4; - // second column (first supernode) - hLrowind(4) = 0; - hLrowind(5) = 1; - hLrowind(6) = 2; - hLrowind(7) = 4; - // third column (second supernode) - hLrowind(8) = 2; - hLrowind(9) = 3; - hLrowind(10) = 4; - // fourth column (third supernode) - hLrowind(11) = 3; - hLrowind(12) = 4; - // fifth column (fourth supernode) - hLrowind(13) = 4; - - // values - // first column (first supernode) - hLvalues(0) = FIVE; - hLvalues(1) = TWO; - hLvalues(2) = ONE; - hLvalues(3) = ZERO; - // second column (first supernode) - hLvalues(4) = ZERO; - hLvalues(5) = FIVE; - hLvalues(6) = ZERO; - hLvalues(7) = ONE; - // third column (second supernode) - hLvalues(8) = FIVE; - hLvalues(9) = ONE; - hLvalues(10) = ONE; - // fourth column (third supernode) - hLvalues(11) = FIVE; - hLvalues(12) = ONE; - // fifth column (fourth supernode) - hLvalues(13) = FIVE; - - // store Lt in crsmat - host_graph_t static_graph(hLrowind, hLcolptr); - L = host_crsmat_t("CrsMatrixL", nrows, hLvalues, static_graph); - - bool is_lower_tri = true; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - is_lower_tri); - - // generate B = A*ONES = L*(U*ONES), where X = U*ONES (on device) { - RowMapType Lcolptr("Lcolptr", nrows + 1); - EntriesType Lrowind("Lrowind", nnz_sp); - ValuesType Lvalues("Lvalues", nnz_sp); - - Kokkos::deep_copy(Lcolptr, hLcolptr); - Kokkos::deep_copy(Lrowind, hLrowind); - Kokkos::deep_copy(Lvalues, hLvalues); - - crsMat_t mtxL("mtxL", nrows, nrows, nnz_sp, Lvalues, Lcolptr, Lrowind); - KokkosSparse::spmv("T", ONE, mtxL, X, ZERO, B); + // L in csc + const size_type nnz_sp = 14; + + // first column (first supernode) + // second column (first supernode) + // third column (second supernode) + // fourth column (third supernode) + // fifth column (fourth supernode) + + auto lt_fixture = get_5x5_lt_fixture(); + row_map_view_t hLcolptr; + cols_view_t hLrowind; + values_view_t hLvalues; + compress_matrix(hLcolptr, hLrowind, hLvalues, lt_fixture); + + // store Lt in crsmat + host_graph_t static_graph(hLrowind, hLcolptr); + L = host_crsmat_t("CrsMatrixL", nrows, hLvalues, static_graph); + + bool is_lower_tri = true; + khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + is_lower_tri); + + // generate B = A*ONES = L*(U*ONES), where X = U*ONES (on device) + { + RowMapType Lcolptr("Lcolptr", nrows + 1); + EntriesType Lrowind("Lrowind", nnz_sp); + ValuesType Lvalues("Lvalues", nnz_sp); + + Kokkos::deep_copy(Lcolptr, hLcolptr); + Kokkos::deep_copy(Lrowind, hLrowind); + Kokkos::deep_copy(Lvalues, hLvalues); + + Crs mtxL("mtxL", nrows, nrows, nnz_sp, Lvalues, Lcolptr, Lrowind); + KokkosSparse::spmv("T", ONE, mtxL, X, ZERO, B); + } } - } - { - // unit-test for supernode SpTrsv (default) - // > set up supernodes (block size = one) - size_type nsupers = 4; - Kokkos::View supercols("supercols", - 1 + nsupers); - supercols(0) = 0; - supercols(1) = 2; // two columns - supercols(2) = 3; // one column - supercols(3) = 4; // one column - supercols(4) = 5; // one column - int *etree = NULL; // we generate graph internally - - // invert diagonal blocks - bool invert_diag = true; - khL.set_sptrsv_invert_diagonal(invert_diag); - khU.set_sptrsv_invert_diagonal(invert_diag); - - // > symbolic (on host) - sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, - &khL, U.graph, &khU); - // > numeric (on host) - sptrsv_compute(&khL, L); - sptrsv_compute(&khU, U); - Kokkos::fence(); - - // > solve - ValuesType b("b", nrows); - Kokkos::deep_copy(b, B); - Kokkos::deep_copy(X, ZERO); - sptrsv_solve(&khL, &khU, X, b); - Kokkos::fence(); - - // > check - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, X.extent(0)), - ReductionCheck(X), sum); - if (sum != lhs.extent(0)) { - std::cout << "Supernode Tri Solve FAILURE : " << sum << " vs." - << lhs.extent(0) << std::endl; - khL.get_sptrsv_handle()->print_algorithm(); - } else { - std::cout << "Supernode Tri Solve SUCCESS" << std::endl; - khL.get_sptrsv_handle()->print_algorithm(); + { + // unit-test for supernode SpTrsv (default) + // > set up supernodes (block size = one) + size_type nsupers = 4; + Kokkos::View supercols("supercols", + 1 + nsupers); + supercols(0) = 0; + supercols(1) = 2; // two columns + supercols(2) = 3; // one column + supercols(3) = 4; // one column + supercols(4) = 5; // one column + int *etree = NULL; // we generate graph internally + + // invert diagonal blocks + bool invert_diag = true; + khL.set_sptrsv_invert_diagonal(invert_diag); + khU.set_sptrsv_invert_diagonal(invert_diag); + + // > symbolic (on host) + sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, + &khL, U.graph, &khU); + // > numeric (on host) + sptrsv_compute(&khL, L); + sptrsv_compute(&khU, U); + Kokkos::fence(); + + // > solve + ValuesType b("b", nrows); + Kokkos::deep_copy(b, B); + Kokkos::deep_copy(X, ZERO); + sptrsv_solve(&khL, &khU, X, b); + Kokkos::fence(); + + // > check + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), + ReductionCheck(X), sum); + EXPECT_EQ(sum, lhs.extent(0)); + EXPECT_EQ(sum, X.extent(0)); + + khL.destroy_sptrsv_handle(); + khU.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(X.extent(0))); - khL.destroy_sptrsv_handle(); - khU.destroy_sptrsv_handle(); - } - - { - // unit-test for supernode SpTrsv (running TRMM on device for compute) - // > set up supernodes - size_type nsupers = 4; - Kokkos::View supercols("supercols", - 1 + nsupers); - supercols(0) = 0; - supercols(1) = 2; // two columns - supercols(2) = 3; // one column - supercols(3) = 4; // one column - supercols(4) = 5; // one column - int *etree = NULL; // we generate tree internally - - // > create handles - KernelHandle khLd; - KernelHandle khUd; - khLd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); - khUd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, false); - - // > invert diagonal blocks - bool invert_diag = true; - khLd.set_sptrsv_invert_diagonal(invert_diag); - khUd.set_sptrsv_invert_diagonal(invert_diag); - - // > invert off-diagonal blocks - bool invert_offdiag = true; - khUd.set_sptrsv_column_major(true); - khLd.set_sptrsv_invert_offdiagonal(invert_offdiag); - khUd.set_sptrsv_invert_offdiagonal(invert_offdiag); - - // > forcing sptrsv compute to perform TRMM on device - khLd.set_sptrsv_diag_supernode_sizes(1, 1); - khUd.set_sptrsv_diag_supernode_sizes(1, 1); - - // > symbolic (on host) - sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, - &khLd, Ut.graph, &khUd); - // > numeric (on host) - sptrsv_compute(&khLd, L); - sptrsv_compute(&khUd, Ut); - Kokkos::fence(); - - // > solve - ValuesType b("b", nrows); - Kokkos::deep_copy(b, B); - Kokkos::deep_copy(X, ZERO); - sptrsv_solve(&khLd, &khUd, X, b); - Kokkos::fence(); - - // > check - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, X.extent(0)), - ReductionCheck(X), sum); - if (sum != lhs.extent(0)) { - std::cout << "Supernode Tri Solve FAILURE : " << sum << " vs." - << lhs.extent(0) << std::endl; - khLd.get_sptrsv_handle()->print_algorithm(); - } else { - std::cout << "Supernode Tri Solve SUCCESS" << std::endl; - khLd.get_sptrsv_handle()->print_algorithm(); + { + // unit-test for supernode SpTrsv (running TRMM on device for compute) + // > set up supernodes + size_type nsupers = 4; + Kokkos::View supercols("supercols", + 1 + nsupers); + supercols(0) = 0; + supercols(1) = 2; // two columns + supercols(2) = 3; // one column + supercols(3) = 4; // one column + supercols(4) = 5; // one column + int *etree = NULL; // we generate tree internally + + // > create handles + KernelHandle khLd; + KernelHandle khUd; + khLd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); + khUd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + false); + + // > invert diagonal blocks + bool invert_diag = true; + khLd.set_sptrsv_invert_diagonal(invert_diag); + khUd.set_sptrsv_invert_diagonal(invert_diag); + + // > invert off-diagonal blocks + bool invert_offdiag = true; + khUd.set_sptrsv_column_major(true); + khLd.set_sptrsv_invert_offdiagonal(invert_offdiag); + khUd.set_sptrsv_invert_offdiagonal(invert_offdiag); + + // > forcing sptrsv compute to perform TRMM on device + khLd.set_sptrsv_diag_supernode_sizes(1, 1); + khUd.set_sptrsv_diag_supernode_sizes(1, 1); + + // > symbolic (on host) + sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, + &khLd, Ut.graph, &khUd); + // > numeric (on host) + sptrsv_compute(&khLd, L); + sptrsv_compute(&khUd, Ut); + Kokkos::fence(); + + // > solve + ValuesType b("b", nrows); + Kokkos::deep_copy(b, B); + Kokkos::deep_copy(X, ZERO); + sptrsv_solve(&khLd, &khUd, X, b); + Kokkos::fence(); + + // > check + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), + ReductionCheck(X), sum); + EXPECT_EQ(sum, lhs.extent(0)); + EXPECT_EQ(sum, X.extent(0)); + + khLd.destroy_sptrsv_handle(); + khUd.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(X.extent(0))); - - khLd.destroy_sptrsv_handle(); - khUd.destroy_sptrsv_handle(); - } #endif + } } -} - -template -void run_test_sptrsv_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - using crsMat_t = CrsMatrix; - // Workaround for OpenMP: skip tests if concurrency < nstreams because of - // not enough resource to partition - bool run_streams_test = true; + static void run_test_sptrsv_streams(int test_algo, int nstreams) { + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; #ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - int exec_concurrency = execution_space().concurrency(); - if (exec_concurrency < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: concurrency = " << exec_concurrency - << std::endl; + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } } - } #endif - if (!run_streams_test) return; - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - const size_type nrows = 5; - const size_type nnz = 10; - - std::vector instances; - if (nstreams == 1) - instances = Kokkos::Experimental::partition_space(execution_space(), 1); - else if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else // (nstreams == 4) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); - - std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); - std::vector row_map_v(nstreams); - std::vector entries_v(nstreams); - std::vector values_v(nstreams); - std::vector rhs_v(nstreams); - std::vector lhs_v(nstreams); - - RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); - EntriesType_hostmirror hentries("hentries", nnz); - ValuesType_hostmirror hvalues("hvalues", nnz); - - // Upper tri - { - hrow_map(0) = 0; - hrow_map(1) = 2; - hrow_map(2) = 4; - hrow_map(3) = 7; - hrow_map(4) = 9; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 1; - hentries(3) = 4; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 4; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } + if (!run_streams_test) return; - for (int i = 0; i < nstreams; i++) { - // Allocate U - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); + const size_type nrows = 5; + const size_type nnz = 10; - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + auto instances = Kokkos::Experimental::partition_space( + execution_space(), std::vector(nstreams, 1)); - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector row_map_v(nstreams); + std::vector entries_v(nstreams); + std::vector values_v(nstreams); + std::vector rhs_v(nstreams); + std::vector lhs_v(nstreams); - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = false; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy( - 0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - if (sum != lhs_v[i].extent(0)) { - std::cout << "Upper Tri Solve FAILURE on stream " << i << std::endl; - kh_v[i].get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + RowMapType_hostmirror hrow_map; + EntriesType_hostmirror hentries; + ValuesType_hostmirror hvalues; - kh_v[i].destroy_sptrsv_handle(); - } - } - - // Lower tri - { - hrow_map(0) = 0; - hrow_map(1) = 1; - hrow_map(2) = 2; - hrow_map(3) = 4; - hrow_map(4) = 6; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 1; - hentries(2) = 0; - hentries(3) = 2; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 1; - hentries(7) = 2; - hentries(8) = 3; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; + // Upper tri + { + auto fixture = get_5x5_ut_ones_fixture(); + compress_matrix(hrow_map, hentries, hvalues, fixture); + + for (int i = 0; i < nstreams; i++) { + // Allocate U + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = false; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + EXPECT_EQ(sum, lhs_v[i].extent(0)); + + kh_v[i].destroy_sptrsv_handle(); + } } - for (int i = 0; i < nstreams; i++) { - // Allocate L - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); - - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = true; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy( - 0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - if (sum != lhs_v[i].extent(0)) { - std::cout << "Lower Tri Solve FAILURE on stream " << i << std::endl; - kh_v[i].get_sptrsv_handle()->print_algorithm(); + // Lower tri + { + auto fixture = get_5x5_lt_ones_fixture(); + compress_matrix(hrow_map, hentries, hvalues, fixture); + + for (int i = 0; i < nstreams; i++) { + // Allocate L + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = true; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + EXPECT_EQ(sum, lhs_v[i].extent(0)); + + kh_v[i].destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); - - kh_v[i].destroy_sptrsv_handle(); } } -} +}; } // namespace Test template void test_sptrsv() { - Test::run_test_sptrsv(); - // Test::run_test_sptrsv_mtx(); + using TestStruct = Test::SptrsvTest; + TestStruct::run_test_sptrsv(); } template void test_sptrsv_streams() { - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; - Test::run_test_sptrsv_streams(0, 1); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 2); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 3); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 4); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; - Test::run_test_sptrsv_streams(1, 1); + using TestStruct = Test::SptrsvTest; - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 2); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 3); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 4); + TestStruct::run_test_sptrsv_streams(0, 1); + TestStruct::run_test_sptrsv_streams(0, 2); + TestStruct::run_test_sptrsv_streams(0, 3); + TestStruct::run_test_sptrsv_streams(0, 4); + TestStruct::run_test_sptrsv_streams(1, 1); + TestStruct::run_test_sptrsv_streams(1, 2); + TestStruct::run_test_sptrsv_streams(1, 3); + TestStruct::run_test_sptrsv_streams(1, 4); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) if (std::is_same::value && std::is_same::value) { - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 1 stream" << std::endl; - Test::run_test_sptrsv_streams(2, 1); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 2); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 3); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 4); + TestStruct::run_test_sptrsv_streams(2, 1); + TestStruct::run_test_sptrsv_streams(2, 2); + TestStruct::run_test_sptrsv_streams(2, 3); + TestStruct::run_test_sptrsv_streams(2, 4); } #endif } diff --git a/sparse/unit_test/Test_vector_fixtures.hpp b/sparse/unit_test/Test_vector_fixtures.hpp index b17197a3e5..2037a5485e 100644 --- a/sparse/unit_test/Test_vector_fixtures.hpp +++ b/sparse/unit_test/Test_vector_fixtures.hpp @@ -27,17 +27,23 @@ namespace Test { -template +template +scalar_t KEEP_ZERO() { + return scalar_t(-9999.0); +} + +template void compress_matrix( - RowMapT& row_map, EntriesT& entries, ValuesT& values, + MapT& map, EntriesT& entries, ValuesT& values, const std::vector>& fixture) { - using size_type = typename RowMapT::non_const_value_type; + using size_type = typename MapT::non_const_value_type; using scalar_t = typename ValuesT::non_const_value_type; const scalar_t ZERO = scalar_t(0); const size_type nrows = fixture.size(); + const size_type ncols = fixture[0].size(); // Count fixture nnz's size_type nnz = 0; @@ -50,35 +56,42 @@ void compress_matrix( } // Allocate device CRS views - Kokkos::resize(row_map, nrows + 1); + Kokkos::resize(map, (CSC ? ncols : nrows) + 1); Kokkos::resize(entries, nnz); Kokkos::resize(values, nnz); // Create host mirror views for CRS - auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hmap = Kokkos::create_mirror_view(map); auto hentries = Kokkos::create_mirror_view(entries); auto hvalues = Kokkos::create_mirror_view(values); // Compress into CRS (host views) size_type curr_nnz = 0; - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { - if (fixture[row_idx][col_idx] != ZERO) { - hentries(curr_nnz) = col_idx; - hvalues(curr_nnz) = fixture[row_idx][col_idx]; + + const size_type num_outer = (CSC ? ncols : nrows); + const size_type num_inner = (CSC ? nrows : ncols); + for (size_type outer_idx = 0; outer_idx < num_outer; ++outer_idx) { + for (size_type inner_idx = 0; inner_idx < num_inner; ++inner_idx) { + const size_type row = CSC ? inner_idx : outer_idx; + const size_type col = CSC ? outer_idx : inner_idx; + const auto val = fixture[row][col]; + if (val != ZERO) { + hentries(curr_nnz) = inner_idx; + hvalues(curr_nnz) = val == KEEP_ZERO() ? ZERO : val; ++curr_nnz; } - hrow_map(row_idx + 1) = curr_nnz; + hmap(outer_idx + 1) = curr_nnz; } } // Copy host CRS views to device CRS views - Kokkos::deep_copy(row_map, hrow_map); + Kokkos::deep_copy(map, hmap); Kokkos::deep_copy(entries, hentries); Kokkos::deep_copy(values, hvalues); } -template +template std::vector> decompress_matrix(const RowMapT& row_map, const EntriesT& entries, const ValuesT& values) { @@ -105,9 +118,13 @@ decompress_matrix(const RowMapT& row_map, const EntriesT& entries, const size_type row_nnz_begin = hrow_map(row_idx); const size_type row_nnz_end = hrow_map(row_idx + 1); for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { - const auto col_idx = hentries(row_nnz); - const scalar_t value = hvalues(row_nnz); - result[row_idx][col_idx] = value; + const auto col_idx = hentries(row_nnz); + const scalar_t value = hvalues(row_nnz); + if (CSC) { + result[col_idx][row_idx] = value; + } else { + result[row_idx][col_idx] = value; + } } } From 934cd7da77960292c3e50d1d10d9fec5c1b943bf Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 26 Feb 2024 10:54:38 -0700 Subject: [PATCH 173/326] CMake: error out in certain case (#2115) Graph unit tests are unique in that they use default_scalar for the KokkosKernelsHandle. So if test-eti-only is ON, but neither float nor double is instatiated, then error out for the graph unit tests. Users can still build without float or double if they want, but only if they turn off tests or the graph component. --- graph/unit_test/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/graph/unit_test/CMakeLists.txt b/graph/unit_test/CMakeLists.txt index 63539d9776..b497953159 100644 --- a/graph/unit_test/CMakeLists.txt +++ b/graph/unit_test/CMakeLists.txt @@ -10,6 +10,12 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_C # # ##################### +IF (KokkosKernels_TEST_ETI_ONLY) + IF (NOT KokkosKernels_INST_DOUBLE AND NOT KokkosKernels_INST_FLOAT) + MESSAGE(FATAL_ERROR "Because only ETI'd type combinations are enabled for testing, the Kokkos Kernels graph tests require that double or float is enabled in ETI.") + ENDIF () +ENDIF () + ##################### # # # Add GPU backends # From 63cd89e2d5d83d357e0e9887f8055cdd0f3074e6 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 28 Feb 2024 17:14:51 -0700 Subject: [PATCH 174/326] Wiki examples for BLAS2 functions are added (#2122) Some small additional change the the function headers themselves to add some missing header file inclusions. Applying clang-format Removing constexpr since it won't happen before some work in Core. --- blas/src/KokkosBlas2_ger.hpp | 11 ++-------- blas/src/KokkosBlas2_syr.hpp | 7 ++----- blas/src/KokkosBlas2_syr2.hpp | 2 ++ example/wiki/CMakeLists.txt | 1 + example/wiki/blas/CMakeLists.txt | 19 +++++++++++++++++ example/wiki/blas/KokkosBlas2_wiki_ger.cpp | 23 +++++++++++++++++++++ example/wiki/blas/KokkosBlas2_wiki_syr.cpp | 20 ++++++++++++++++++ example/wiki/blas/KokkosBlas2_wiki_syr2.cpp | 22 ++++++++++++++++++++ 8 files changed, 91 insertions(+), 14 deletions(-) create mode 100644 example/wiki/blas/CMakeLists.txt create mode 100644 example/wiki/blas/KokkosBlas2_wiki_ger.cpp create mode 100644 example/wiki/blas/KokkosBlas2_wiki_syr.cpp create mode 100644 example/wiki/blas/KokkosBlas2_wiki_syr2.cpp diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp index fbfc9c1f98..8650577faf 100644 --- a/blas/src/KokkosBlas2_ger.hpp +++ b/blas/src/KokkosBlas2_ger.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSBLAS2_GER_HPP_ #define KOKKOSBLAS2_GER_HPP_ +#include "KokkosKernels_helpers.hpp" + #include namespace KokkosBlas { @@ -42,15 +44,6 @@ template ::assignable, - "AViewType memory space must be assignable from XViewType"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "AViewType memory space must be assignable from YViewType"); - static_assert( Kokkos::SpaceAccessibility::accessible, diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index af66767ab4..00d1d8b3de 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSBLAS2_SYR_HPP_ #define KOKKOSBLAS2_SYR_HPP_ +#include "KokkosKernels_helpers.hpp" + #include namespace KokkosBlas { @@ -64,11 +66,6 @@ template void syr(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::assignable, - "AViewType memory space must be assignable from XViewType"); - static_assert( Kokkos::SpaceAccessibility::accessible, diff --git a/blas/src/KokkosBlas2_syr2.hpp b/blas/src/KokkosBlas2_syr2.hpp index c9a2f7b2c5..d86abd31c1 100644 --- a/blas/src/KokkosBlas2_syr2.hpp +++ b/blas/src/KokkosBlas2_syr2.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSBLAS2_SYR2_HPP_ #define KOKKOSBLAS2_SYR2_HPP_ +#include "KokkosKernels_helpers.hpp" + #include #include diff --git a/example/wiki/CMakeLists.txt b/example/wiki/CMakeLists.txt index 11c6e0d97d..1e751f5797 100644 --- a/example/wiki/CMakeLists.txt +++ b/example/wiki/CMakeLists.txt @@ -1,2 +1,3 @@ +ADD_SUBDIRECTORY(blas) ADD_SUBDIRECTORY(sparse) ADD_SUBDIRECTORY(graph) diff --git a/example/wiki/blas/CMakeLists.txt b/example/wiki/blas/CMakeLists.txt new file mode 100644 index 0000000000..245957bc89 --- /dev/null +++ b/example/wiki/blas/CMakeLists.txt @@ -0,0 +1,19 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../../../../test_common) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_blas2_ger + SOURCES KokkosBlas2_wiki_ger.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_blas2_syr + SOURCES KokkosBlas2_wiki_syr.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_blas2_syr2 + SOURCES KokkosBlas2_wiki_syr2.cpp + ) diff --git a/example/wiki/blas/KokkosBlas2_wiki_ger.cpp b/example/wiki/blas/KokkosBlas2_wiki_ger.cpp new file mode 100644 index 0000000000..89eaaf9292 --- /dev/null +++ b/example/wiki/blas/KokkosBlas2_wiki_ger.cpp @@ -0,0 +1,23 @@ +#include +#include + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + constexpr int M = 5; + constexpr int N = 4; + + Kokkos::View A("A", M, N); + Kokkos::View x("X", M); + Kokkos::View y("Y", N); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(x, 3.0); + Kokkos::deep_copy(y, 1.3); + + const double alpha = Kokkos::ArithTraits::one(); + + KokkosBlas::ger("T", alpha, x, y, A); + } + Kokkos::finalize(); +} diff --git a/example/wiki/blas/KokkosBlas2_wiki_syr.cpp b/example/wiki/blas/KokkosBlas2_wiki_syr.cpp new file mode 100644 index 0000000000..26c6a489b8 --- /dev/null +++ b/example/wiki/blas/KokkosBlas2_wiki_syr.cpp @@ -0,0 +1,20 @@ +#include +#include + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + constexpr int M = 5; + + Kokkos::View A("A", M, M); + Kokkos::View x("X", M); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(x, 3.0); + + const double alpha = double(1.0); + + KokkosBlas::syr("T", "U", alpha, x, A); + } + Kokkos::finalize(); +} diff --git a/example/wiki/blas/KokkosBlas2_wiki_syr2.cpp b/example/wiki/blas/KokkosBlas2_wiki_syr2.cpp new file mode 100644 index 0000000000..c1c8e5d0d1 --- /dev/null +++ b/example/wiki/blas/KokkosBlas2_wiki_syr2.cpp @@ -0,0 +1,22 @@ +#include +#include + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + constexpr int M = 5; + + Kokkos::View A("A", M, M); + Kokkos::View x("X", M); + Kokkos::View y("Y", M); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(x, 3.0); + Kokkos::deep_copy(y, 1.3); + + const double alpha = double(1.0); + + KokkosBlas::syr2("T", "U", alpha, x, y, A); + } + Kokkos::finalize(); +} From 4f2a095fa1a952d3868565856f8241bf6fe161ba Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 29 Feb 2024 13:19:04 -0700 Subject: [PATCH 175/326] Increase tolerance on gesv test (Fix #2123) (#2124) And uncomment the verbose output for when tolerance is exceeded, since that helps debug this sort of issue. This is only printed at most once so it won't spam the output if the entire vector is wrong. --- lapack/unit_test/Test_Lapack_gesv.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 31bc0e708b..77774d1d3f 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -130,17 +130,17 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Checking vs ref on CPU, this eps is about 10^-9 typedef typename ats::mag_type mag_type; - const mag_type eps = 2.0e7 * ats::epsilon(); + const mag_type eps = 3.0e7 * ats::epsilon(); bool test_flag = true; for (int i = 0; i < N; i++) { if (ats::abs(h_B(i) - h_X0(i)) > eps) { test_flag = false; - // printf( - // " Error %d, pivot %c, padding %c: result( %.15lf ) !=" - // "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", - // N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), - // int(i), ats::abs(h_B(i) - h_X0(i)), eps); - // break; + printf( + " Error %d, pivot %c, padding %c: result( %.15lf ) !=" + "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", + N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), + ats::abs(h_B(i) - h_X0(i)), eps); + break; } } ASSERT_EQ(test_flag, true); From 80b1a1877bea269e2de6027f7447111d0e64d58a Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 4 Mar 2024 11:39:30 -0700 Subject: [PATCH 176/326] Spmv handle (#2126) * spmv handle, TPL reuse * using handle in unification layer and hooking up new algorithm enums with old Controls options * Update spmv_merge perf test Compare KK merge vs. default and KK native * Small changes to help text of spmv_merge perf test * Complete backwards compatibility with Controls interface - copy over spmv algorithm selection correctly - copy expert tuning parameters * Controls spmv: accept other name for bsr algo * bsr spmv test: disable tensor core It was not actually being run before due to a different name actually enabling it (experimental_bsr_tc rather than experimental_tc) * Disable OneMKL spmv for complex types oneapi 2023.2 throws error saying complex isn't supported * OneMKL: call optimize_gemv during setup --- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 31 +- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 40 +- .../KokkosSparse_spmv_bsr_benchmark.cpp | 16 +- perf_test/sparse/KokkosSparse_spmv_merge.cpp | 149 +- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 200 +- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 165 +- sparse/impl/KokkosSparse_spmv_impl.hpp | 91 +- sparse/impl/KokkosSparse_spmv_spec.hpp | 161 +- sparse/src/KokkosSparse_BsrMatrix.hpp | 4 + sparse/src/KokkosSparse_CrsMatrix.hpp | 4 + sparse/src/KokkosSparse_Utils_mkl.hpp | 59 +- sparse/src/KokkosSparse_spmv.hpp | 1616 ++++------------- sparse/src/KokkosSparse_spmv_deprecated.hpp | 299 +++ sparse/src/KokkosSparse_spmv_handle.hpp | 393 ++++ ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 82 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 1133 ++++++------ .../KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 5 +- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 85 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 86 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 897 ++++----- sparse/unit_test/Test_Sparse_spmv.hpp | 239 +-- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 121 +- 22 files changed, 2855 insertions(+), 3021 deletions(-) create mode 100644 sparse/src/KokkosSparse_spmv_deprecated.hpp create mode 100644 sparse/src/KokkosSparse_spmv_handle.hpp diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 3f4893363a..1024356f7b 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -38,11 +38,11 @@ typedef default_size_type Offset; template void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, int num_vecs, char mode, Scalar beta) { - typedef KokkosSparse::CrsMatrix - matrix_type; - typedef typename Kokkos::View mv_type; - typedef typename mv_type::HostMirror h_mv_type; + using matrix_type = + KokkosSparse::CrsMatrix; + using mv_type = Kokkos::View; + using h_mv_type = typename mv_type::HostMirror; srand(17312837); matrix_type A; @@ -67,6 +67,7 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, mv_type x("X", numCols, num_vecs); mv_type y("Y", numRows, num_vecs); + h_mv_type h_x = Kokkos::create_mirror_view(x); h_mv_type h_y = Kokkos::create_mirror_view(y); h_mv_type h_y_compare = Kokkos::create_mirror(y); @@ -82,14 +83,24 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, // Benchmark auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0); auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0); - // Do 5 warm up calls (not timed) + + // Create handles for both rank-1 and rank-2 cases, + // even though only 1 will get used below (depending on num_vecs) + + KokkosSparse::SPMVHandle + handle_rank1; + KokkosSparse::SPMVHandle + handle_rank2; + // Do 5 warm up calls (not timed). This will also initialize the handle. for (int i = 0; i < 5; i++) { if (num_vecs == 1) { // run the rank-1 version - KokkosSparse::spmv(&mode, 1.0, A, x0, beta, y0); + KokkosSparse::spmv(&handle_rank1, &mode, 1.0, A, x0, beta, y0); } else { // rank-2 - KokkosSparse::spmv(&mode, 1.0, A, x, beta, y); + KokkosSparse::spmv(&handle_rank2, &mode, 1.0, A, x, beta, y); } Kokkos::DefaultExecutionSpace().fence(); } @@ -97,10 +108,10 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, for (int i = 0; i < loop; i++) { if (num_vecs == 1) { // run the rank-1 version - KokkosSparse::spmv(&mode, 1.0, A, x0, beta, y0); + KokkosSparse::spmv(&handle_rank1, &mode, 1.0, A, x0, beta, y0); } else { // rank-2 - KokkosSparse::spmv(&mode, 1.0, A, x, beta, y); + KokkosSparse::spmv(&handle_rank2, &mode, 1.0, A, x, beta, y); } Kokkos::DefaultExecutionSpace().fence(); } diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp index d3b038f0e4..d96a3c6c8d 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -159,18 +159,22 @@ int test_bsr_matrix_single_vec( y_vector_type ycrs("crs_product_result", nRow); auto h_ycrs = Kokkos::create_mirror_view(ycrs); - KokkosKernels::Experimental::Controls controls; + KokkosSparse::SPMVAlgorithm algo = KokkosSparse::SPMV_DEFAULT; + switch (static_cast(test)) { case Implementation::KokkosKernels: { - controls.setParameter("algorithm", "native"); + algo = KokkosSparse::SPMV_NATIVE; } break; default: break; } + KokkosSparse::SPMVHandle + handle_crs(algo); // Do the multiplication for warming up for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); Kokkos::deep_copy(ycrs, h_ycrs); - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); // Time a series of multiplications with the CrsMatrix double time_crs = 0.0; @@ -178,7 +182,7 @@ int test_bsr_matrix_single_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); time_crs += timer.seconds(); Kokkos::fence(); } @@ -192,10 +196,14 @@ int test_bsr_matrix_single_vec( scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, int> Absr(Acrs, blockSize); + KokkosSparse::SPMVHandle + handle_bsr(algo); + // Do the multiplication for warming up for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); Kokkos::deep_copy(ybsr, h_ybsr); - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); // Time a series of multiplications with the BsrMatrix double time_bsr = 0.0; @@ -203,7 +211,7 @@ int test_bsr_matrix_single_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); Kokkos::deep_copy(ybsr, h_ybsr); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); time_bsr += timer.seconds(); Kokkos::fence(); } @@ -316,19 +324,23 @@ int test_bsr_matrix_vec( block_vector_t ycrs("crs_product_result", nRow, nvec); auto h_ycrs = Kokkos::create_mirror_view(ycrs); - KokkosKernels::Experimental::Controls controls; + KokkosSparse::SPMVAlgorithm algo = KokkosSparse::SPMV_DEFAULT; + switch (static_cast(test)) { case Implementation::KokkosKernels: { - controls.setParameter("algorithm", "native"); + algo = KokkosSparse::SPMV_NATIVE; } break; default: break; } + KokkosSparse::SPMVHandle + handle_crs(algo); // Do the multiplication for warming up for (Ordinal jc = 0; jc < nvec; ++jc) for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ycrs, h_ycrs); - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); // Time a series of multiplications with the CrsMatrix format double time_crs = 0.0; @@ -337,7 +349,7 @@ int test_bsr_matrix_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); time_crs += timer.seconds(); Kokkos::fence(); } @@ -347,6 +359,10 @@ int test_bsr_matrix_vec( scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, int> Absr(Acrs, blockSize); + KokkosSparse::SPMVHandle + handle_bsr(algo); + block_vector_t ybsr("bsr_product_result", nRow, nvec); auto h_ybsr = Kokkos::create_mirror_view(ybsr); @@ -354,7 +370,7 @@ int test_bsr_matrix_vec( for (Ordinal jc = 0; jc < nvec; ++jc) for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ybsr, h_ybsr); - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); // Time a series of multiplications with the BsrMatrix double time_bsr = 0.0; @@ -363,7 +379,7 @@ int test_bsr_matrix_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ybsr, h_ybsr); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); time_bsr += timer.seconds(); Kokkos::fence(); } diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp index 770b09cfb1..254a35c34f 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp @@ -207,9 +207,10 @@ struct SpmvNative { typename YView> static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, const XView &x, const Beta &beta, const YView &y) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native"); - return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + KokkosSparse::SPMVHandle + handle(KokkosSparse::SPMV_NATIVE); + return KokkosSparse::spmv(&handle, mode, alpha, crs, x, beta, y); } static std::string name() { return "native"; } @@ -221,9 +222,10 @@ struct SpmvV41 { typename YView> static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, const XView &x, const Beta &beta, const YView &y) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "v4.1"); - return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + KokkosSparse::SPMVHandle + handle(KokkosSparse::SPMV_BSR_V41); + return KokkosSparse::spmv(&handle, mode, alpha, crs, x, beta, y); } static std::string name() { return "v4.1"; } @@ -473,4 +475,4 @@ int main(int argc, char **argv) { drop_cache(); Kokkos::finalize(); return 0; -} \ No newline at end of file +} diff --git a/perf_test/sparse/KokkosSparse_spmv_merge.cpp b/perf_test/sparse/KokkosSparse_spmv_merge.cpp index 6ad772116e..fdd2905b52 100644 --- a/perf_test/sparse/KokkosSparse_spmv_merge.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_merge.cpp @@ -148,9 +148,8 @@ matrix_type generate_unbalanced_matrix( void print_help() { printf("SPMV merge benchmark code written by Luc Berger-Vergiat.\n"); - printf( - "The goal is to test cusSPARSE's merge algorithm on imbalanced " - "matrices."); + printf("The goal is to compare the merge path algorithm vs.\n"); + printf("TPLs and the KK native algorithm on imbalanced matrices.\n"); printf("Options:\n"); printf( " --compare : Compare the performance of the merge algo with the " @@ -233,35 +232,59 @@ int main(int argc, char** argv) { Kokkos::initialize(argc, argv); { - if (std::is_same::value) { - // Note that we template the matrix with entries=lno_t and offsets=lno_t - // to make sure it verifies the cusparse requirements - using matrix_type = - KokkosSparse::CrsMatrix; - using values_type = typename matrix_type::values_type::non_const_type; - const Scalar SC_ONE = Kokkos::ArithTraits::one(); - const Scalar alpha = SC_ONE + SC_ONE; - const Scalar beta = alpha + SC_ONE; - - matrix_type test_matrix = generate_unbalanced_matrix( - numRows, numEntries, numLongRows, numLongEntries); - - values_type y("right hand side", test_matrix.numRows()); - values_type x("left hand side", test_matrix.numCols()); - Kokkos::deep_copy(x, SC_ONE); - Kokkos::deep_copy(y, SC_ONE); - - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "merge"); - - // Perform a so called "warm-up" run - KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - - double min_time = 1.0e32, max_time = 0.0, avg_time = 0.0; + // Note that we template the matrix with entries=lno_t and offsets=lno_t + // so that TPLs can be used + using matrix_type = + KokkosSparse::CrsMatrix; + using values_type = typename matrix_type::values_type::non_const_type; + using handle_type = + KokkosSparse::SPMVHandle; + const Scalar SC_ONE = Kokkos::ArithTraits::one(); + const Scalar alpha = SC_ONE + SC_ONE; + const Scalar beta = alpha + SC_ONE; + + matrix_type test_matrix = generate_unbalanced_matrix( + numRows, numEntries, numLongRows, numLongEntries); + + values_type y("right hand side", test_matrix.numRows()); + values_type x("left hand side", test_matrix.numCols()); + Kokkos::deep_copy(x, SC_ONE); + Kokkos::deep_copy(y, SC_ONE); + + handle_type handleMerge(KokkosSparse::SPMV_MERGE_PATH); + + // Perform a so called "warm-up" run + KokkosSparse::spmv(&handleMerge, "N", alpha, test_matrix, x, beta, y); + + double min_time = 1.0e32, max_time = 0.0, avg_time = 0.0; + for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { + Kokkos::Timer timer; + KokkosSparse::spmv(&handleMerge, "N", alpha, test_matrix, x, beta, y); + Kokkos::fence(); + double time = timer.seconds(); + avg_time += time; + if (time > max_time) max_time = time; + if (time < min_time) min_time = time; + } + + std::cout << "KK Merge alg --- min: " << min_time << " max: " << max_time + << " avg: " << avg_time / loop << std::endl; + + // Run the cusparse default algorithm and native kokkos-kernels algorithm + // then output timings for comparison + if (compare) { + handle_type handleDefault; + // Warm up + KokkosSparse::spmv(&handleDefault, "N", alpha, test_matrix, x, beta, y); + + min_time = 1.0e32; + max_time = 0.0; + avg_time = 0.0; for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { Kokkos::Timer timer; - KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); + KokkosSparse::spmv(&handleDefault, "N", alpha, test_matrix, x, beta, y); Kokkos::fence(); double time = timer.seconds(); avg_time += time; @@ -269,58 +292,28 @@ int main(int argc, char** argv) { if (time < min_time) min_time = time; } - std::cout << "cuSPARSE Merge alg --- min: " << min_time + std::cout << "Default alg --- min: " << min_time << " max: " << max_time << " avg: " << avg_time / loop << std::endl; - // Run the cusparse default algorithm and native kokkos-kernels algorithm - // then output timings for comparison - if (compare) { - controls.setParameter("algorithm", "default"); - - min_time = 1.0e32; - max_time = 0.0; - avg_time = 0.0; - for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { - Kokkos::Timer timer; - KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - Kokkos::fence(); - double time = timer.seconds(); - avg_time += time; - if (time > max_time) max_time = time; - if (time < min_time) min_time = time; - } - - std::cout << "cuSPARSE Default alg --- min: " << min_time - << " max: " << max_time << " avg: " << avg_time / loop - << std::endl; - - controls.setParameter("algorithm", "native"); - - min_time = 1.0e32; - max_time = 0.0; - avg_time = 0.0; - for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { - Kokkos::Timer timer; - // KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - KokkosSparse::Impl::spmv_beta(Kokkos::DefaultExecutionSpace{}, - controls, "N", alpha, test_matrix, x, - beta, y); - Kokkos::fence(); - double time = timer.seconds(); - avg_time += time; - if (time > max_time) max_time = time; - if (time < min_time) min_time = time; - } - - std::cout << "Kokkos Native alg --- min: " << min_time - << " max: " << max_time << " avg: " << avg_time / loop - << std::endl; + handle_type handleNative(KokkosSparse::SPMV_NATIVE); + KokkosSparse::spmv(&handleNative, "N", alpha, test_matrix, x, beta, y); + + min_time = 1.0e32; + max_time = 0.0; + avg_time = 0.0; + for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { + Kokkos::Timer timer; + KokkosSparse::spmv(&handleNative, "N", alpha, test_matrix, x, beta, y); + Kokkos::fence(); + double time = timer.seconds(); + avg_time += time; + if (time > max_time) max_time = time; + if (time < min_time) min_time = time; } - } else { - std::cout << "The default execution space is not Cuda, nothing to do!" + + std::cout << "KK Native alg --- min: " << min_time + << " max: " << max_time << " avg: " << avg_time / loop << std::endl; } } diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 06fe6f094d..1c2e4f80e9 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -677,13 +677,12 @@ struct BSR_GEMV_Functor { // spMatVec_no_transpose: version for CPU execution spaces // (RangePolicy or trivial serial impl used) // -template ()>::type * = nullptr> void spMatVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -704,15 +703,8 @@ void spMatVec_no_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMV_Functor func( alpha, A, x, beta, y, A.blockDim(), useConjugate); @@ -738,13 +730,12 @@ void spMatVec_no_transpose( // // spMatVec_no_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> void spMatVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -758,15 +749,9 @@ void spMatVec_no_transpose( AMatrix_Internal; typedef typename AMatrix_Internal::execution_space execution_space; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = -1; int vector_length = -1; const auto block_dim = A.blockDim(); @@ -788,14 +773,10 @@ void spMatVec_no_transpose( int64_t worksets = A.numRows(); // - // Use the controls to allow the user to pass in some tuning parameters. + // Use the handle to allow the user to pass in some tuning parameters. // - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMV_Functor func( alpha, A, x, beta, y, block_dim, useConjugate); @@ -990,13 +971,12 @@ struct BSR_GEMV_Transpose_Functor { /// \brief spMatVec_transpose: version for CPU execution spaces (RangePolicy or /// trivial serial impl used) -template ()>::type * = nullptr> void spMatVec_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -1019,15 +999,8 @@ void spMatVec_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMV_Transpose_Functor func( alpha, A, x, y, useConjugate); @@ -1051,15 +1024,14 @@ void spMatVec_transpose( // // spMatVec_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> void spMatVec_transpose(const typename AMatrix::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, - const AlphaType &alpha, const AMatrix &A, - const XVector &x, const BetaType &beta, YVector &y, - bool useConjugate) { + Handle *handle, const AlphaType &alpha, + const AMatrix &A, const XVector &x, + const BetaType &beta, YVector &y, bool useConjugate) { if (A.numRows() <= 0) { return; } @@ -1073,17 +1045,10 @@ void spMatVec_transpose(const typename AMatrix::execution_space &exec, else if (beta != Kokkos::ArithTraits::one()) KokkosBlas::scal(exec, y, beta, y); - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } - int team_size = -1; - int vector_length = -1; + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = -1; + int vector_length = -1; int64_t worksets = A.numRows(); @@ -1104,14 +1069,10 @@ void spMatVec_transpose(const typename AMatrix::execution_space &exec, } // - // Use the controls to allow the user to pass in some tuning parameters. + // Use the handle to allow the user to pass in some tuning parameters. // - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMV_Transpose_Functor func(alpha, A, x, y, useConjugate); @@ -1319,13 +1280,12 @@ struct BSR_GEMM_Functor { // spMatMultiVec_no_transpose: version for CPU execution spaces // (RangePolicy or trivial serial impl used) // -template ()>::type * = nullptr> void spMatMultiVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -1344,15 +1304,8 @@ void spMatMultiVec_no_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMM_Functor func(alpha, A, x, beta, y, useConjugate); @@ -1379,13 +1332,12 @@ void spMatMultiVec_no_transpose( // spMatMultiVec_no_transpose: version for GPU execution spaces (TeamPolicy // used) // -template ()>::type * = nullptr> void spMatMultiVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -1399,15 +1351,10 @@ void spMatMultiVec_no_transpose( AMatrix_Internal; typedef typename AMatrix_Internal::execution_space execution_space; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = + handle->force_dynamic_schedule; // Forces the use of a dynamic schedule + bool use_static_schedule = + handle->force_static_schedule; // Forces the use of a static schedule int team_size = -1; int vector_length = -1; @@ -1429,14 +1376,10 @@ void spMatMultiVec_no_transpose( } // - // Use the controls to allow the user to pass in some tuning parameters. + // Use the handle to allow the user to pass in some tuning parameters. // - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMM_Functor func(alpha, A, x, beta, y, useConjugate); @@ -1649,14 +1592,13 @@ struct BSR_GEMM_Transpose_Functor { /// \brief spMatMultiVec_transpose: version for CPU execution spaces /// (RangePolicy or trivial serial impl used) -template ()>::type * = nullptr> void spMatMultiVec_transpose( - const execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, - const AlphaType &alpha, + const execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { @@ -1674,15 +1616,8 @@ void spMatMultiVec_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMM_Transpose_Functor @@ -1705,15 +1640,14 @@ void spMatMultiVec_transpose( // // spMatMultiVec_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> -void spMatMultiVec_transpose( - const execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, - const AlphaType &alpha, const AMatrix &A, const XVector &x, - const BetaType &beta, YVector &y, bool useConjugate) { +void spMatMultiVec_transpose(const execution_space &exec, Handle *handle, + const AlphaType &alpha, const AMatrix &A, + const XVector &x, const BetaType &beta, YVector &y, + bool useConjugate) { if (A.numRows() <= 0) { return; } @@ -1723,18 +1657,11 @@ void spMatMultiVec_transpose( else if (beta != Kokkos::ArithTraits::one()) KokkosBlas::scal(exec, y, beta, y); - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } - int team_size = -1; - int vector_length = -1; - int64_t worksets = A.numRows(); + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = -1; + int vector_length = -1; + int64_t worksets = A.numRows(); const auto block_dim = A.blockDim(); if (block_dim <= 4) { @@ -1752,15 +1679,10 @@ void spMatMultiVec_transpose( } // - // Use the controls to allow the user to pass in some tuning - // parameters. + // Use the handle to allow the user to pass in some tuning parameters. // - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMM_Transpose_Functor func( alpha, A, x, y, useConjugate); diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 564100879e..cde7fc1461 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -21,7 +21,7 @@ #include #include "KokkosSparse_BsrMatrix.hpp" -#include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_handle.hpp" #include "KokkosKernels_Error.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include @@ -33,12 +33,14 @@ namespace Experimental { namespace Impl { // default is no eti available -template +template struct spmv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; -template > struct spmv_mv_bsrmatrix_eti_spec_avail { @@ -55,6 +57,9 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { template <> \ struct spmv_bsrmatrix_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -75,6 +80,9 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { template <> \ struct spmv_mv_bsrmatrix_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -99,76 +107,71 @@ namespace Experimental { namespace Impl { // declaration -template ::value, + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_bsrmatrix_eti_spec_avail< - ExecutionSpace, AMatrix, XVector, YVector>::value> + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &x, - const YScalar &beta, const YVector &y); + static void spmv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &x, + const YScalar &beta, const YVector &y); }; // declaration -template , bool tpl_spec_avail = spmv_mv_bsrmatrix_tpl_spec_avail< - ExecutionSpace, AMatrix, XVector, YVector>::value, + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_mv_bsrmatrix_eti_spec_avail< - ExecutionSpace, AMatrix, XVector, YVector>::value> + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_mv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &x, - const YScalar &beta, const YVector &y); + static void spmv_mv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &x, + const YScalar &beta, const YVector &y); }; // actual implementations to be compiled #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -// these should all be different -constexpr inline const char *ALG_V41 = "v4.1"; -constexpr inline const char *ALG_V42 = "v4.2"; -constexpr inline const char *ALG_TC = "experimental_bsr_tc"; - -template -struct SPMV_BSRMATRIX +struct SPMV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &X, - const YScalar &beta, const YVector &Y) { + static void spmv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { const bool modeIsNoTrans = (mode[0] == NoTranspose[0]); const bool modeIsConjugate = (mode[0] == Conjugate[0]); const bool modeIsConjugateTrans = (mode[0] == ConjugateTranspose[0]); const bool modeIsTrans = (mode[0] == Transpose[0]); // use V41 if requested - if (controls.getParameter("algorithm") == ALG_V41) { + if (handle->algo == SPMV_BSR_V41) { if (modeIsNoTrans || modeIsConjugate) { - return Bsr::spMatVec_no_transpose(space, controls, alpha, A, X, beta, Y, + return Bsr::spMatVec_no_transpose(space, handle, alpha, A, X, beta, Y, modeIsConjugate); } else if (modeIsTrans || modeIsConjugateTrans) { - return Bsr::spMatVec_transpose(space, controls, alpha, A, X, beta, Y, + return Bsr::spMatVec_transpose(space, handle, alpha, A, X, beta, Y, modeIsConjugateTrans); } } // use V42 if possible if (KokkosKernels::Impl::kk_is_gpu_exec_space() || - controls.getParameter("algorithm") == ALG_V42) { + handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -177,10 +180,10 @@ struct SPMV_BSRMATRIX -struct SPMV_MV_BSRMATRIX { +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; enum class Method { @@ -204,25 +208,16 @@ struct SPMV_MV_BSRMATRIXalgo == SPMV_BSR_TC) method = Method::TensorCores; if (!KokkosSparse::Experimental::Impl::TensorCoresAvailable< ExecutionSpace, AMatrix, XVector, YVector>::value) { method = Method::Fallback; @@ -249,28 +244,23 @@ struct SPMV_MV_BSRMATRIXbsr_tc_precision; switch (precision) { - case Precision::Mixed: { + case Bsr_TC_Precision::Mixed: { BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, beta, Y); return; } - case Precision::Double: { + case Bsr_TC_Precision::Double: { BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, beta, Y); return; } - case Precision::Automatic: // fallthrough + case Bsr_TC_Precision::Automatic: // fallthrough default: { constexpr bool operandsHalfHalfFloat = std::is_same::value && @@ -312,19 +302,19 @@ struct SPMV_MV_BSRMATRIXalgo == SPMV_BSR_V41) { if (modeIsNoTrans || modeIsConjugate) { - return Bsr::spMatMultiVec_no_transpose(space, controls, alpha, A, X, - beta, Y, modeIsConjugate); + return Bsr::spMatMultiVec_no_transpose(space, handle, alpha, A, X, beta, + Y, modeIsConjugate); } else if (modeIsTrans || modeIsConjugateTrans) { - return Bsr::spMatMultiVec_transpose(space, controls, alpha, A, X, beta, - Y, modeIsConjugateTrans); + return Bsr::spMatMultiVec_transpose(space, handle, alpha, A, X, beta, Y, + modeIsConjugateTrans); } } // use V42 if possible if (KokkosKernels::Impl::kk_is_gpu_exec_space() || - controls.getParameter("algorithm") == ALG_V42) { + handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -333,10 +323,10 @@ struct SPMV_MV_BSRMATRIX -struct SPMV_MV_BSRMATRIX { +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_mv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &X, - const YScalar &beta, const YVector &Y) { + static void spmv_mv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { const auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); - typedef SPMV_BSRMATRIX impl_type; - impl_type::spmv_bsrmatrix(space, controls, mode, alpha, A, x_j, beta, - y_j); + impl_type::spmv_bsrmatrix(space, handle, mode, alpha, A, x_j, beta, y_j); } } }; @@ -387,6 +376,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -405,6 +397,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -426,6 +421,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -444,6 +442,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 4f90002a61..5f9cbea040 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -24,6 +24,7 @@ #include "KokkosBlas1_scal.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spmv_handle.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" #include "KokkosSparse_spmv_impl_merge.hpp" #include "KokkosKernels_Error.hpp" @@ -249,16 +250,15 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, // spmv_beta_no_transpose: version for CPU execution spaces (RangePolicy or // trivial serial impl used) -template ()>::type* = nullptr> -static void spmv_beta_no_transpose( - const execution_space& exec, - const KokkosKernels::Experimental::Controls& controls, - typename YVector::const_value_type& alpha, const AMatrix& A, - const XVector& x, typename YVector::const_value_type& beta, - const YVector& y) { +static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, + typename YVector::const_value_type& alpha, + const AMatrix& A, const XVector& x, + typename YVector::const_value_type& beta, + const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; if (A.numRows() <= static_cast(0)) { @@ -363,15 +363,8 @@ static void spmv_beta_no_transpose( } #endif - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; SPMV_Functor func(alpha, A, x, beta, y, 1); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) @@ -389,47 +382,26 @@ static void spmv_beta_no_transpose( } // spmv_beta_no_transpose: version for GPU execution spaces (TeamPolicy used) -template ()>::type* = nullptr> -static void spmv_beta_no_transpose( - const execution_space& exec, - const KokkosKernels::Experimental::Controls& controls, - typename YVector::const_value_type& alpha, const AMatrix& A, - const XVector& x, typename YVector::const_value_type& beta, - const YVector& y) { +static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, + typename YVector::const_value_type& alpha, + const AMatrix& A, const XVector& x, + typename YVector::const_value_type& beta, + const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; if (A.numRows() <= static_cast(0)) { return; } - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } - int team_size = -1; - int vector_length = -1; - int64_t rows_per_thread = -1; - - // Note on 03/24/20, lbv: We can use the controls - // here to allow the user to pass in some tunning - // parameters. - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } - if (controls.isParameter("rows per thread")) { - rows_per_thread = std::stoll(controls.getParameter("rows per thread")); - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = handle->team_size; + int vector_length = handle->vector_length; + int64_t rows_per_thread = handle->rows_per_thread; int64_t rows_per_team = spmv_launch_parameters( A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length); @@ -622,30 +594,29 @@ static void spmv_beta_transpose(const execution_space& exec, op); } -template -static void spmv_beta(const execution_space& exec, - const KokkosKernels::Experimental::Controls& controls, +template +static void spmv_beta(const execution_space& exec, Handle* handle, const char mode[], typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { + if (handle->algo == SPMV_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, handle, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { - if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { + if (handle->algo == SPMV_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, handle, alpha, A, x, beta, y); } } else if (mode[0] == Transpose[0]) { spmv_beta_transpose #include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_handle.hpp" // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include @@ -30,11 +30,13 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_eti_spec_avail { enum : bool { value = false }; }; -template > struct spmv_mv_eti_spec_avail { @@ -50,6 +52,9 @@ struct spmv_mv_eti_spec_avail { template <> \ struct spmv_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -70,6 +75,9 @@ struct spmv_mv_eti_spec_avail { template <> \ struct spmv_mv_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -100,17 +108,16 @@ namespace Impl { /// /// For the implementation of KokkosSparse::spmv for multivectors (2-D /// Views), see the SPMV_MV struct below. -template < - class ExecutionSpace, class AMatrix, class XVector, class YVector, - bool tpl_spec_avail = - spmv_tpl_spec_avail::value, - bool eti_spec_avail = - spmv_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = spmv_eti_spec_avail< + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -140,18 +147,18 @@ struct SPMV { /// matrix's entries have integer type. Per Github Issue #700, we /// don't optimize as heavily for that case, in order to reduce build /// times and library sizes. -template , - bool tpl_spec_avail = spmv_mv_tpl_spec_avail::value, - bool eti_spec_avail = spmv_mv_eti_spec_avail::value> + bool tpl_spec_avail = spmv_mv_tpl_spec_avail< + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value, + bool eti_spec_avail = spmv_mv_eti_spec_avail< + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV_MV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& controls, + static void spmv_mv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -160,78 +167,101 @@ struct SPMV_MV { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spmv for single vectors (1-D Views). // Unification layer -template -struct SPMV +struct SPMV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::ArithTraits KAT; - if (alpha == KAT::zero()) { - if (beta != KAT::one()) { - KokkosBlas::scal(space, y, beta, y); - } - return; - } - if (beta == KAT::zero()) { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } else { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } } }; //! Full specialization of spmv_mv for single vectors (2-D Views). // Unification layer -template -struct SPMV_MV +struct SPMV_MV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::ArithTraits KAT; - - if (alpha == KAT::zero()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } else if (alpha == KAT::one()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } else if (alpha == -KAT::one()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); + // Intercept special case: if x/y have only 1 column and both are + // contiguous, use the more efficient single-vector impl. + // + // We cannot do this if x or y is noncontiguous, because the column subview + // must be LayoutStride which is not ETI'd. + // + // Do not use a TPL even if one is available for the types: + // we don't want the same handle being used in both TPL and non-TPL versions + if (x.extent(1) == size_t(1) && x.span_is_contiguous() && + y.span_is_contiguous()) { + Kokkos::View + x0(x.data(), x.extent(0)); + Kokkos::View + y0(y.data(), y.extent(0)); + if (beta == KAT::zero()) { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } else if (beta == KAT::one()) { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } else if (beta == -KAT::one()) { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } else { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } } else { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); + if (alpha == KAT::zero()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else if (alpha == KAT::one()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else if (alpha == -KAT::one()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } } } }; -template -struct SPMV_MV +struct SPMV_MV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { @@ -241,9 +271,10 @@ struct SPMV_MV + typedef SPMV impl_type; - impl_type::spmv(space, defaultControls, mode, alpha, A, x_j, beta, y_j); + impl_type::spmv(space, handle, mode, alpha, A, x_j, beta, y_j); } } }; @@ -264,6 +295,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -282,6 +316,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -300,6 +337,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -318,6 +358,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index e0d6e61a3b..db9ef71753 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -1108,6 +1108,10 @@ template struct is_bsr_matrix> : public std::true_type {}; template struct is_bsr_matrix> : public std::true_type {}; + +/// \brief Equivalent to is_bsr_matrix::value. +template +inline constexpr bool is_bsr_matrix_v = is_bsr_matrix::value; //---------------------------------------------------------------------------- } // namespace Experimental diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index 7070172a1f..ce9ec99e4e 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -867,5 +867,9 @@ struct is_crs_matrix> : public std::true_type {}; template struct is_crs_matrix> : public std::true_type {}; +/// \brief Equivalent to is_crs_matrix::value. +template +inline constexpr bool is_crs_matrix_v = is_crs_matrix::value; + } // namespace KokkosSparse #endif diff --git a/sparse/src/KokkosSparse_Utils_mkl.hpp b/sparse/src/KokkosSparse_Utils_mkl.hpp index 7a8dd0cb22..2f9437c650 100644 --- a/sparse/src/KokkosSparse_Utils_mkl.hpp +++ b/sparse/src/KokkosSparse_Utils_mkl.hpp @@ -88,11 +88,58 @@ struct mkl_is_supported_value_type> : std::true_type {}; template <> struct mkl_is_supported_value_type> : std::true_type {}; +// Helper to: +// - define the MKL type equivalent to a given Kokkos scalar type +// - provide an easy implicit conversion to that MKL type +template +struct KokkosToMKLScalar { + static_assert(mkl_is_supported_value_type::value, + "Scalar type not supported by MKL"); + using type = Scalar; + KokkosToMKLScalar(Scalar val_) : val(val_) {} + operator Scalar() const { return val; } + Scalar val; +}; + +template <> +struct KokkosToMKLScalar> { + using type = MKL_Complex8; + KokkosToMKLScalar(Kokkos::complex val_) : val(val_) {} + operator MKL_Complex8() const { return {val.real(), val.imag()}; } + Kokkos::complex val; +}; + +template <> +struct KokkosToMKLScalar> { + using type = MKL_Complex16; + KokkosToMKLScalar(Kokkos::complex val_) : val(val_) {} + operator MKL_Complex16() const { return {val.real(), val.imag()}; } + Kokkos::complex val; +}; + +template +struct KokkosToOneMKLScalar { + // Note: we happen to use the same set of types in classic MKL and OneMKL. + // If that changes, update this logic. + static_assert(mkl_is_supported_value_type::value, + "Scalar type not supported by OneMKL"); + using type = Scalar; +}; + +template +struct KokkosToOneMKLScalar> { + static_assert(mkl_is_supported_value_type>::value, + "Scalar type not supported by OneMKL"); + using type = std::complex; +}; + // MKLSparseMatrix provides thin wrapper around MKL matrix handle // (sparse_matrix_t) and encapsulates MKL call dispatches related to details // like value_type, allowing simple client code in kernels. template class MKLSparseMatrix { + static_assert(mkl_is_supported_value_type::value, + "Provided value_type type not supported by MKL"); sparse_matrix_t mtx; public: @@ -100,11 +147,7 @@ class MKLSparseMatrix { // Constructs MKL sparse matrix from KK sparse views (m rows x n cols) inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols, - MKL_INT *xadj, MKL_INT *adj, value_type *values) { - throw std::runtime_error( - "Scalar type used in MKLSparseMatrix is NOT " - "supported by MKL"); - } + MKL_INT *xadj, MKL_INT *adj, value_type *values) {} // Allows using MKLSparseMatrix directly in MKL calls inline operator sparse_matrix_t() const { return mtx; } @@ -112,11 +155,7 @@ class MKLSparseMatrix { // Exports MKL sparse matrix contents into KK views inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start, MKL_INT *&columns, - value_type *&values) { - throw std::runtime_error( - "Scalar type used in MKLSparseMatrix is NOT " - "supported by MKL"); - } + value_type *&values) {} inline void destroy() { KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index bd038813d1..bcff9e29e9 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -22,7 +22,7 @@ #define KOKKOSSPARSE_SPMV_HPP_ #include "KokkosKernels_helpers.hpp" -#include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_handle.hpp" #include "KokkosSparse_spmv_spec.hpp" #include "KokkosSparse_spmv_struct_spec.hpp" #include "KokkosSparse_spmv_bsrmatrix_spec.hpp" @@ -40,816 +40,47 @@ struct RANK_ONE {}; struct RANK_TWO {}; } // namespace -/// \brief Kokkos sparse matrix-vector multiply on single -/// vectors (RANK_ONE tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is -/// controlled by mode (see below). -/// -/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-1 Kokkos::View and its rank must match that of XVector -/// -/// \param space [in] The execution space instance on which to run the -/// kernel. -/// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. -/// \param y [in/out] Result vector. -/// \param tag RANK_ONE dispatch -#ifdef DOXY // documentation version - don't separately document SFINAE - // specializations for BSR and CRS -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_ONE& tag) { - - // Make sure that x and y are Views. - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - -// Make sure that x and y have the same rank. -// Make sure that x (and therefore y) is rank 1. -#if (KOKKOS_VERSION >= 40100) - static_assert(XVector::rank() == YVector::rank(), - "KokkosSparse::spmv: Vector ranks do not match."); - - static_assert(XVector::rank() == 1, - "KokkosSparse::spmv: Both Vector inputs must have rank 1 " - "in order to call this specialization of spmv."); -#else - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); - static_assert(static_cast(XVector::rank) == 1, - "KokkosSparse::spmv: Both Vector inputs must have rank 1 " - "in order to call this specialization of spmv."); -#endif - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // Check compatibility of dimensions at run time. - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(x.extent(0))) || - (static_cast(A.numRows()) > static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv: Dimensions do not match: " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(y.extent(0))) || - (static_cast(A.numRows()) > static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv: Dimensions do not match (transpose): " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - - typedef KokkosSparse::CrsMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_Internal; - - AMatrix_Internal A_i = A; - XVector_Internal x_i = x; - YVector_Internal y_i = y; - - if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || - A_i.numCols() == 0 || A_i.nnz() == 0) { - // This is required to maintain semantics of KokkosKernels native SpMV: - // if y contains NaN but beta = 0, the result y should be filled with 0. - // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(space, y_i, beta, y_i); - return; - } - - // Whether to call KokkosKernel's native implementation, even if a TPL impl is - // available - bool useFallback = controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl"); - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the conjugate mode (C) - if constexpr (std::is_same_v || - std::is_same_v) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } - // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for all - // scalar types -#if defined(CUSPARSE_VER_MAJOR) && (CUSPARSE_VER_MAJOR == 12) - uintptr_t yptr = uintptr_t((void*)y.data()); - if (yptr % 16 != 0) useFallback = true; -#endif -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - if (std::is_same::value) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same_v) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } -#ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } -#endif -#endif - - if (useFallback) { - // Explicitly call the non-TPL SPMV implementation - std::string label = - "KokkosSparse::spmv[NATIVE," + - Kokkos::ArithTraits< - typename AMatrix_Internal::non_const_value_type>::name() + - "]"; - Kokkos::Profiling::pushRegion(label); - Impl::SPMV::spmv(space, controls, mode, alpha, A_i, - x_i, beta, y_i); - Kokkos::Profiling::popRegion(); - } else { - // note: the cuSPARSE spmv wrapper defines a profiling region, so one is not - // needed here. - Impl::SPMV::spmv(space, controls, mode, alpha, A_i, x_i, - beta, y_i); - } -} - -/// \brief Kokkos sparse matrix-vector multiply on single -/// vector (RANK_ONE tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is -/// controlled by mode (see below). -/// -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-1 Kokkos::View and its rank must match that of XVector -/// -/// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. -/// \param y [in/out] Result vector. -/// \param tag RANK_ONE dispatch -#ifdef DOXY // documentation version -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} - -#ifndef DOXY // hide SFINAE specialization for BSR -template ::value>::type* = nullptr> -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_ONE& tag) { - // Make sure that x and y are Views. - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that x and y have the same rank. -#if (KOKKOS_VERSION >= 40100) - static_assert(XVector::rank() == YVector::rank(), - "KokkosSparse::spmv: Vector ranks do not match."); -#else - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); -#endif - // Make sure that x (and therefore y) is rank 1. - static_assert(static_cast(XVector::rank) == 1, - "KokkosSparse::spmv: Both Vector inputs must have rank 1 " - "in order to call this specialization of spmv."); - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // - if (A.blockDim() == 1) { - KokkosSparse::CrsMatrix< - typename AMatrix::value_type, typename AMatrix::ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::size_type> - Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, - RANK_ONE()); - return; - } - // Check compatibility of dimensions at run time. - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(x.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match: " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " - << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(y.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match " - "(transpose): " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " - << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - // - typedef KokkosSparse::Experimental::BsrMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_Internal; - - AMatrix_Internal A_i(A); - XVector_Internal x_i(x); - YVector_Internal y_i(y); - - if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || - A_i.numCols() == 0 || A_i.nnz() == 0) { - // This is required to maintain semantics of KokkosKernels native SpMV: - // if y contains NaN but beta = 0, the result y should be filled with 0. - // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(space, y_i, beta, y_i); - return; - } - - // - // Whether to call KokkosKernel's native implementation, even if a TPL impl is - // available - bool useFallback = controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl"); - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the modes (C), (T), (H) - if (std::is_same::value || - std::is_same::value) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - // rocSparse does not support the modes (C), (T), (H) - if constexpr (std::is_same_v) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } -#endif - - if (useFallback) { - // Explicitly call the non-TPL SPMV_BSRMATRIX implementation - std::string label = - "KokkosSparse::spmv[NATIVE,BSRMATRIX," + - Kokkos::ArithTraits< - typename AMatrix_Internal::non_const_value_type>::name() + - "]"; - Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, controls, - mode, alpha, A_i, - x_i, beta, y_i); - Kokkos::Profiling::popRegion(); - } else { - constexpr bool tpl_spec_avail = - KokkosSparse::Experimental::Impl::spmv_bsrmatrix_tpl_spec_avail< - ExecutionSpace, AMatrix_Internal, XVector_Internal, - YVector_Internal>::value; - - constexpr bool eti_spec_avail = - tpl_spec_avail - ? KOKKOSKERNELS_IMPL_COMPILE_LIBRARY /* force FALSE in app/test */ - : KokkosSparse::Experimental::Impl::spmv_bsrmatrix_eti_spec_avail< - ExecutionSpace, AMatrix_Internal, XVector_Internal, - YVector_Internal>::value; - - Experimental::Impl::SPMV_BSRMATRIX< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - tpl_spec_avail, eti_spec_avail>::spmv_bsrmatrix(space, controls, mode, - alpha, A_i, x_i, beta, - y_i); - } -} - -template ::value>::type* = nullptr> -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} -#endif // ifndef DOXY - -namespace Impl { -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y); - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y); -}; - -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); - return true; - } - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(space, mode, alpha, A, x, beta, y); - return true; - } -}; - -#else - -template -struct SPMV2D1D { - static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, - const BetaType& /*beta*/, const YVector& /*y*/) { - return false; - } - - template - static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], - const AlphaType& /*alpha*/, const AMatrix& /*A*/, - const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - return false; - } -}; -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || !defined(KOKKOSKERNELS_ETI_ONLY) -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); - return true; - } - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(space, mode, alpha, A, x, beta, y); - return true; - } -}; - -#else - -template -struct SPMV2D1D { - static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, - const BetaType& /*beta*/, const YVector& /*y*/) { - return false; - } - - template - static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], - const AlphaType& /*alpha*/, const AMatrix& /*A*/, - const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - return false; - } -}; -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || !defined(KOKKOSKERNELS_ETI_ONLY) -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); - return true; - } - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(space, mode, alpha, A, x, beta, y); - return true; - } -}; - -#else - -template -struct SPMV2D1D { - static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, - const BetaType& /*beta*/, const YVector& /*y*/) { - return false; - } - - template - static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], - const AlphaType& /*alpha*/, const AMatrix& /*A*/, - const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - return false; - } -}; -#endif -} // namespace Impl - -template -using SPMV2D1D - [[deprecated("KokkosSparse::SPMV2D1D is not part of the public interface - " - "use KokkosSparse::spmv instead")]] = - Impl::SPMV2D1D; - -/// \brief Kokkos sparse matrix-vector multiply on multivectors -/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +// clang-format off +/// \brief Kokkos sparse matrix-vector multiply. +/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is /// controlled by mode (see below). /// /// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. +/// the memory spaces of A, x, and y. Must match Handle::ExecutionSpaceType. +/// \tparam Handle Specialization of KokkosSparse::SPMVHandle /// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector +/// YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix. Must be identical to Handle::AMatrixType. +/// \tparam XVector Type of x, must be a rank-1 or 2 Kokkos::View. Must be identical to Handle::XVectorType. +/// \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. +/// \tparam YVector Type of y, must be a rank-1 or 2 Kokkos::View and its rank must match that of XVector. Must +/// be identical to Handle::YVectorType. /// -/// \param space [in] The execution space instance on which to run the -/// kernel. -/// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. -/// \param y [in/out] Result vector. -/// \param tag RANK_TWO dispatch -#ifdef DOXY // documentation version -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_TWO& tag) { - // Make sure that x and y are Views. - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); -// Make sure that x and y have the same rank. -#if (KOKKOS_VERSION >= 40100) - static_assert(XVector::rank() == YVector::rank(), - "KokkosSparse::spmv: Vector ranks do not match."); -#else - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); -#endif - // Make sure that x (and therefore y) is rank 2. - static_assert(static_cast(XVector::rank) == 2, - "KokkosSparse::spmv: Both Vector inputs must have rank 2 " - "in order to call this specialization of spmv."); - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // Check compatibility of dimensions at run time. - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(x.extent(0))) || - (static_cast(A.numRows()) > static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match: " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(y.extent(0))) || - (static_cast(A.numRows()) > static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match (transpose): " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - - typedef KokkosSparse::CrsMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - AMatrix_Internal A_i = A; - - // Call single-vector version if appropriate - if (x.extent(1) == 1) { - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_SubInternal; - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_SubInternal; - - XVector_SubInternal x_i = Kokkos::subview(x, Kokkos::ALL(), 0); - YVector_SubInternal y_i = Kokkos::subview(y, Kokkos::ALL(), 0); - - // spmv (mode, alpha, A, x_i, beta, y_i); - using impl_type = - Impl::SPMV2D1D; - if (impl_type::spmv2d1d(space, mode, alpha, A, x_i, beta, y_i)) { - return; - } - } - { - typedef Kokkos::View< - typename XVector::const_value_type**, typename XVector::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View > - YVector_Internal; - - XVector_Internal x_i = x; - YVector_Internal y_i = y; - - bool useNative = false; - -// cusparseSpMM does not support conjugate mode -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - useNative = useNative || (Conjugate[0] == mode[0]); -#endif - useNative = useNative || (controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl")); - - if (useNative) { - return Impl::SPMV_MV< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - std::is_integral::value, - false>::spmv_mv(space, controls, mode, alpha, A_i, x_i, beta, y_i); - } else { - return Impl::SPMV_MV::spmv_mv(space, controls, mode, - alpha, A_i, x_i, beta, - y_i); - } - } -} - -/// \brief Kokkos sparse matrix-vector multiply on multivectors -/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is -/// controlled by mode (see below). -/// -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector -/// -/// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. -/// \param y [in/out] Result vector. -/// \param tag RANK_TWO dispatch -#ifdef DOXY -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} - -#ifndef DOXY // hide SFINAE -template ::value>::type* = nullptr> -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], +/// \param space [in] The execution space instance on which to run the +/// kernel. +/// \param handle [in/out] a pointer to a KokkosSparse::SPMVHandle. On the first call to spmv with +/// a given handle instance, the handle's internal data will be initialized automatically. +/// On all later calls to spmv, this internal data will be reused. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. If handle has previously been passed to spmv, A must be identical to the +/// A passed in to that first call. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. +// clang-format on +template +void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_TWO& tag) { + const BetaType& beta, const YVector& y) { + // Make sure A is a CrsMatrix or BsrMatrix. + static_assert( + is_crs_matrix_v || Experimental::is_bsr_matrix_v, + "KokkosSparse::spmv: AMatrix must be a CrsMatrix or BsrMatrix"); // Make sure that x and y are Views. static_assert(Kokkos::is_view::value, "KokkosSparse::spmv: XVector must be a Kokkos::View."); @@ -859,459 +90,422 @@ void spmv(const ExecutionSpace& space, static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + "KokkosSparse::spmv: AMatrix must be accessible from ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + "KokkosSparse::spmv: XVector must be accessible from ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); + "KokkosSparse::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that x and y have the same rank. - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); - // Make sure that x (and therefore y) is rank 2. - static_assert(static_cast(XVector::rank) == 2, - "KokkosSparse::spmv: Both Vector inputs must have rank 2 " - "in order to call this specialization of spmv."); + static_assert(XVector::rank() == YVector::rank(), + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that x (and therefore y) is rank 1 or 2. + static_assert(XVector::rank() == size_t(1) || XVector::rank() == size_t(2), + "KokkosSparse::spmv: Both Vector inputs must have rank 1 or 2"); // Make sure that y is non-const. - static_assert(std::is_same::value, + static_assert(!std::is_const_v, "KokkosSparse::spmv: Output Vector must be non-const."); - // - if (A.blockDim() == 1) { - KokkosSparse::CrsMatrix< - typename AMatrix::value_type, typename AMatrix::ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::size_type> - Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, - RANK_TWO()); - return; - } + // Check that A, X, Y types match that of the Handle + static_assert( + std::is_same_v, + "KokkosSparse::spmv: AMatrix must be identical to Handle::AMatrixType"); + static_assert( + std::is_same_v, + "KokkosSparse::spmv: XVector must be identical to Handle::XVectorType"); + static_assert( + std::is_same_v, + "KokkosSparse::spmv: YVector must be identical to Handle::YVectorType"); + + constexpr bool isBSR = Experimental::is_bsr_matrix_v; + // Check compatibility of dimensions at run time. + size_t m, n; + + if constexpr (!isBSR) { + m = A.numRows(); + n = A.numCols(); + } else { + m = A.numRows() * A.blockDim(); + n = A.numCols() * A.blockDim(); + } + if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(x.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(y.extent(0)))) { + if ((x.extent(1) != y.extent(1)) || (n != x.extent(0)) || + (m != y.extent(0))) { std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match: " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + os << "KokkosSparse::spmv: Dimensions do not match: " + << ", A: " << m << " x " << n << ", x: " << x.extent(0) << " x " << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); } } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(y.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(x.extent(0)))) { + if ((x.extent(1) != y.extent(1)) || (m != x.extent(0)) || + (n != y.extent(0))) { std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match " - "(transpose): " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " - << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - + os << "KokkosSparse::spmv: Dimensions do not match (transpose): " + << ", A: " << A.numRows() << " x " << A.numCols() + << ", x: " << x.extent(0) << " x " << x.extent(1) + << ", y: " << y.extent(0) << " x " << y.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } } - // - typedef KokkosSparse::Experimental::BsrMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - AMatrix_Internal A_i(A); - typedef Kokkos::View< - typename XVector::const_value_type**, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - XVector_Internal x_i(x); - - typedef Kokkos::View< - typename YVector::non_const_value_type**, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_Internal; - YVector_Internal y_i(y); - // - if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || - A_i.numCols() == 0 || A_i.nnz() == 0) { + // Efficiently handle cases where alpha*Op(A) is equivalent to the zero matrix + if (alpha == Kokkos::ArithTraits::zero() || m == 0 || n == 0 || + A.nnz() == 0) { // This is required to maintain semantics of KokkosKernels native SpMV: // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(space, y_i, beta, y_i); + KokkosBlas::scal(space, y, beta, y); return; } - // - // Call single-vector version if appropriate - // - if (x.extent(1) == 1) { - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_SubInternal; - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_SubInternal; - XVector_SubInternal x_0 = Kokkos::subview(x_i, Kokkos::ALL(), 0); - YVector_SubInternal y_0 = Kokkos::subview(y_i, Kokkos::ALL(), 0); + using HandleImpl = typename Handle::ImplType; - return spmv(space, controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); - } - // - // Whether to call KokkosKernel's native implementation, even if a TPL impl is - // available - bool useFallback = controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl"); + using ACrs_Internal = CrsMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type>; + using ABsr_Internal = Experimental::BsrMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type>; + using AMatrix_Internal = + std::conditional_t; + + AMatrix_Internal A_i(A); + + // Note: data_type of a View includes both the scalar and rank + using XVector_Internal = Kokkos::View< + typename XVector::const_data_type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits>; + + using YVector_Internal = Kokkos::View< + typename YVector::non_const_data_type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; + + XVector_Internal x_i(x); + YVector_Internal y_i(y); + + bool useNative = is_spmv_algorithm_native(handle->get_algorithm()); + // Also use the native algorithm if SPMV_FAST_SETUP was selected and + // rocSPARSE is the possible TPL to use. Native is faster in this case. +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if (handle->get_algorithm() == SPMV_FAST_SETUP && + std::is_same_v) + useNative = true; +#endif + + // Now call the proper implementation depending on isBSR and the rank of X/Y + if constexpr (!isBSR) { + if constexpr (XVector::rank() == 1) { +///////////////// +// CRS, rank 1 // +///////////////// #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the modes (C), (T), (H) - if (std::is_same::value || - std::is_same::value) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } + // cuSPARSE does not support the conjugate mode (C) + if constexpr (std::is_same_v || + std::is_same_v) { + useNative = useNative || (mode[0] == Conjugate[0]); + } + // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for + // all scalar types +#if defined(CUSPARSE_VER_MAJOR) && (CUSPARSE_VER_MAJOR == 12) + uintptr_t yptr = uintptr_t((void*)y.data()); + if (yptr % 16 != 0) useNative = true; +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if (std::is_same::value) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } + if (std::is_same_v) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#ifdef KOKKOS_ENABLE_SYCL + if (std::is_same_v) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#endif +#endif + if (useNative) { + // Explicitly call the non-TPL SPMV implementation + std::string label = + "KokkosSparse::spmv[NATIVE," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Impl::SPMV::spmv(space, + handle, + mode, alpha, + A_i, x_i, + beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + // note: the cuSPARSE spmv wrapper defines a profiling region, so one is + // not needed here. + Impl::SPMV::spmv(space, handle, + mode, alpha, A_i, + x_i, beta, y_i); + } + } else { +///////////////// +// CRS, rank 2 // +///////////////// +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + useNative = useNative || (Conjugate[0] == mode[0]); #endif - if (useFallback) { - // Explicitly call the non-TPL SPMV_BSRMATRIX implementation - std::string label = - "KokkosSparse::spmv[NATIVE,BSMATRIX," + - Kokkos::ArithTraits< - typename AMatrix_Internal::non_const_value_type>::name() + - "]"; - Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_MV_BSRMATRIX< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - std::is_integral::value, - false>::spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, - y_i); - Kokkos::Profiling::popRegion(); + if (useNative) { + std::string label = + "KokkosSparse::spmv[NATIVE,MV," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + return Impl::SPMV_MV< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal, + std::is_integral::value, + false>::spmv_mv(space, handle, mode, alpha, A_i, x_i, beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + return Impl::SPMV_MV::spmv_mv(space, handle, mode, + alpha, A_i, x_i, beta, + y_i); + } + } } else { - Experimental::Impl::SPMV_MV_BSRMATRIX< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - std::is_integral::value>:: - spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, y_i); - } -} - -template ::value>::type* = nullptr> -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} + if constexpr (XVector::rank() == 1) { +///////////////// +// BSR, rank 1 // +///////////////// +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE does not support the modes (C), (T), (H) + if (std::is_same::value || + std::is_same::value) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } #endif -/// \brief Public interface to local sparse matrix-vector multiply. -/// -/// Compute y := beta*y + alpha*Op(A)*x, where x and y are either both -/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View -/// instances, and Op(A) is determined -/// by \c mode. If beta == 0, ignore and overwrite the initial -/// entries of y; if alpha == 0, ignore the entries of A and x. -/// -/// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have -/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on -/// Volta or Ampere architectures. On Volta-architecture GPUs the only available -/// precision is mixed-precision fp32 accumulator from fp16 inputs. On -/// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, -/// x is fp16, and y is fp32. Otherwise, double-precision is used. The caller -/// may override this by setting the \c "tc_precision" = \c "mixed" or -/// \c "double" as desired. -/// -/// For mixed precision, performance will degrade for blockDim < 16. -/// For double precision, for blockDim < 8. -/// For such cases, consider an alternate SpMV algorithm. -/// -/// May have \c "algorithm" set to \c "native" to bypass TPLs if they are -/// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a -/// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. -/// -/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank 1 or 2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank 1 or 2 Kokkos::View and its rank must match that of XVector -/// -/// \param space [in] The execution space instance on which to run the -/// kernel. -/// \param controls [in] kokkos-kernels control structure -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). -/// \param beta [in] Scalar multiplier for the (multi)vector y. -/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). It must have the same number -/// of columns as x. -template -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y) { - // Make sure that x and y are Views. - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that both x and y have the same rank. - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // Check compatibility of dimensions at run time. - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numPointCols()) != - static_cast(x.extent(0))) || - (static_cast(A.numPointRows()) != - static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (Generic): Dimensions do not match: " - << ", A: " << A.numPointRows() << " x " << A.numPointCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (std::is_same::value) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#endif - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numPointCols()) != - static_cast(y.extent(0))) || - (static_cast(A.numPointRows()) != - static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (Generic): Dimensions do not match " - "(transpose): " - << ", A: " << A.numPointRows() << " x " << A.numPointCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + // rocSparse does not support the modes (C), (T), (H) + if constexpr (std::is_same_v) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } +#endif + if (useNative) { + // Explicitly call the non-TPL SPMV_BSRMATRIX implementation + std::string label = + "KokkosSparse::spmv[NATIVE,BSRMATRIX," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Experimental::Impl::SPMV_BSRMATRIX< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal, false>::spmv_bsrmatrix(space, handle, mode, alpha, + A_i, x_i, beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + Experimental::Impl::SPMV_BSRMATRIX< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal>::spmv_bsrmatrix(space, handle, mode, alpha, A_i, + x_i, beta, y_i); + } + } else { + ///////////////// + // BSR, rank 2 // + ///////////////// +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE does not support the modes (C), (T), (H) + if (std::is_same::value || + std::is_same::value) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } +#endif - KokkosKernels::Impl::throw_runtime_exception(os.str()); +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (std::is_same::value) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#endif + if (useNative) { + // Explicitly call the non-TPL SPMV_BSRMATRIX implementation + std::string label = + "KokkosSparse::spmv[NATIVE,MV,BSMATRIX," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Experimental::Impl::SPMV_MV_BSRMATRIX< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal, + std::is_integral< + typename AMatrix_Internal::const_value_type>::value, + false>::spmv_mv_bsrmatrix(space, handle, mode, alpha, A_i, x_i, + beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + Experimental::Impl::SPMV_MV_BSRMATRIX< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal, + std::is_integral:: + value>::spmv_mv_bsrmatrix(space, handle, mode, alpha, A_i, x_i, + beta, y_i); + } } } - - if (alpha == Kokkos::ArithTraits::zero() || A.numRows() == 0 || - A.numCols() == 0 || A.nnz() == 0) { - // This is required to maintain semantics of KokkosKernels native SpMV: - // if y contains NaN but beta = 0, the result y should be filled with 0. - // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(space, y, beta, y); - return; - } - // - using RANK_SPECIALISE = - typename std::conditional(XVector::rank) == 2, RANK_TWO, - RANK_ONE>::type; - spmv(space, controls, mode, alpha, A, x, beta, y, RANK_SPECIALISE()); } -/// \brief Public interface to local sparse matrix-vector multiply. -/// -/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both -/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View -/// instances, and Op(A) is determined -/// by \c mode. If beta == 0, ignore and overwrite the initial -/// entries of y; if alpha == 0, ignore the entries of A and x. -/// -/// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have -/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on -/// Volta or Ampere architectures. On Volta-architecture GPUs the only available -/// precision is mixed-precision fp32 accumulator from fp16 inputs. On -/// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, -/// x is fp16, and y is fp32. Otherwise, double-precision is used. The caller -/// may override this by setting the \c "tc_precision" = \c "mixed" or -/// \c "double" as desired. -/// -/// For mixed precision, performance will degrade for blockDim < 16. -/// For double precision, for blockDim < 8. -/// For such cases, consider an alternate SpMV algorithm. -/// -/// May have \c "algorithm" set to \c "native" to bypass TPLs if they are -/// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a -/// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. +// clang-format off +/// \brief Kokkos sparse matrix-vector multiply. +/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode +/// (see below). /// -/// \tparam AMatrix KokkosSparse::CrsMatrix or -/// KokkosSparse::Experimental::BsrMatrix +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or KokkosSparse::Experimental::BsrMatrix +/// \tparam XVector Type of x, must be a rank-1 or rank-2 Kokkos::View +/// \tparam BetaType Type of coefficient beta. Must be convertible to YVector::value_type. +/// \tparam YVector Type of y, must be a Kokkos::View and its rank must match that of XVector /// -/// \param controls [in] kokkos-kernels control structure -/// \param mode [in] "N" for no transpose, "T" for transpose, or "C" -/// for conjugate transpose. +/// \param space [in] The execution space instance on which to run the kernel. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. /// \param alpha [in] Scalar multiplier for the matrix A. /// \param A [in] The sparse matrix A. -/// \param x [in] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). -/// \param beta [in] Scalar multiplier for the (multi)vector y. -/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). It must have the same number -/// of columns as x. -template -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y); -} - -#ifndef DOXY -/// \brief Catch-all public interface to error on invalid Kokkos::Sparse spmv -/// argument types -/// -/// This is a catch-all interface that throws a compile-time error if \c -/// AMatrix is not a CrsMatrix, or BsrMatrix -/// -template ::value && - !KokkosSparse::is_crs_matrix::value>::type* = nullptr> -void spmv(KokkosKernels::Experimental::Controls /*controls*/, - const char[] /*mode*/, const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - // have to arrange this so that the compiler can't tell this is false until - // instantiation - static_assert(KokkosSparse::is_crs_matrix::value || - KokkosSparse::Experimental::is_bsr_matrix::value, - "SpMV: AMatrix must be CrsMatrix or BsrMatrix"); -} - -/// \brief Catch-all public interface to error on invalid Kokkos::Sparse spmv -/// argument types -/// -/// This is a catch-all interface that throws a compile-time error if \c -/// AMatrix is not a CrsMatrix, or BsrMatrix -/// +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. +// clang-format on template ::value && - !KokkosSparse::is_crs_matrix::value>::type* = nullptr> -void spmv(const ExecutionSpace& /* space */, - KokkosKernels::Experimental::Controls /*controls*/, - const char[] /*mode*/, const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - // have to arrange this so that the compiler can't tell this is false until - // instantiation - static_assert(KokkosSparse::is_crs_matrix::value || - KokkosSparse::Experimental::is_bsr_matrix::value, - "SpMV: AMatrix must be CrsMatrix or BsrMatrix"); + typename = std::enable_if_t< + Kokkos::is_execution_space::value>> +void spmv(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + SPMVAlgorithm algo = SPMV_FAST_SETUP; + // Without handle reuse, native is overall faster than rocSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if constexpr (std::is_same_v) + algo = SPMV_NATIVE; +#endif + SPMVHandle + handle(algo); + spmv(space, &handle, mode, alpha, A, x, beta, y); } -#endif // ifndef DOXY +// clang-format off /// \brief Kokkos sparse matrix-vector multiply. /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode /// (see below). /// +/// \tparam Handle Specialization of KokkosSparse::SPMVHandle /// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector -/// +/// YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix. Must be identical to Handle::AMatrixType. +/// \tparam XVector Type of x. Must be a rank-1 or 2 Kokkos::View and be identical to Handle::XVectorType. +/// \tparam BetaType Type of coefficient beta. Must be convertible to YVector::value_type. +/// \tparam YVector Type of y. Must have the same rank as XVector and be identical to Handle::YVectorType. +/// +/// \param handle [in/out] a pointer to a KokkosSparse::SPMVHandle. On the first call to spmv with +/// a given handle instance, the handle's internal data will be initialized automatically. +/// On all later calls to spmv, this internal data will be reused. /// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// transpose, "C" for conjugate or "H" for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. /// \param x [in] A vector to multiply on the left by A. /// \param beta [in] Scalar multiplier for the vector y. /// \param y [in/out] Result vector. -template -void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, const YVector& y) { - KokkosKernels::Experimental::Controls controls; - spmv(controls, mode, alpha, A, x, beta, y); +// clang-format on +template < + class Handle, class AlphaType, class AMatrix, class XVector, class BetaType, + class YVector, + typename = std::enable_if_t::value>> +void spmv(Handle* handle, const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(typename Handle::ExecutionSpaceType(), handle, mode, alpha, A, x, beta, + y); } +// clang-format off /// \brief Kokkos sparse matrix-vector multiply. /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode /// (see below). /// -/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or KokkosSparse::Experimental::BsrMatrix +/// \tparam XVector Type of x, must be a rank-1 or rank-2 Kokkos::View +/// \tparam BetaType Type of coefficient beta. Must be convertible to YVector::value_type. +/// \tparam YVector Type of y, must be a Kokkos::View and its rank must match that of XVector /// -/// \param space [in] The execution space instance on which to run the -/// kernel. /// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// transpose, "C" for conjugate or "H" for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. /// \param x [in] A vector to multiply on the left by A. /// \param beta [in] Scalar multiplier for the vector y. /// \param y [in/out] Result vector. -template -void spmv(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y) { - KokkosKernels::Experimental::Controls controls; - spmv(space, controls, mode, alpha, A, x, beta, y); +// clang-format on +template +void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, const YVector& y) { + SPMVAlgorithm algo = SPMV_FAST_SETUP; + // Without handle reuse, native is overall faster than rocSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if constexpr (std::is_same_v) + algo = SPMV_NATIVE; +#endif + SPMVHandle + handle(algo); + spmv(typename AMatrix::execution_space(), &handle, mode, alpha, A, x, beta, + y); } namespace Experimental { @@ -1332,17 +526,17 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "KokkosSparse::spmv_struct: AMatrix must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: XVector must be accessible from " + "KokkosSparse::spmv_struct: XVector must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: YVector must be accessible from " + "KokkosSparse::spmv_struct: YVector must be accessible from " "ExecutionSpace"); // Make sure that x (and therefore y) is rank 1. static_assert( @@ -1391,13 +585,13 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename XVector::device_type, - Kokkos::MemoryTraits > + Kokkos::MemoryTraits> XVector_Internal; typedef Kokkos::View< typename YVector::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > + typename YVector::device_type, Kokkos::MemoryTraits> YVector_Internal; AMatrix_Internal A_i = A; @@ -1627,25 +821,25 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "KokkosSparse::spmv_struct: AMatrix must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: XVector must be accessible from " + "KokkosSparse::spmv_struct: XVector must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: YVector must be accessible from " + "KokkosSparse::spmv_struct: YVector must be accessible from " "ExecutionSpace"); // Make sure that both x and y have the same rank. static_assert(XVector::rank == YVector::rank, - "KokkosBlas::spmv: Vector ranks do not match."); + "KokkosSparse::spmv: Vector ranks do not match."); // Make sure that y is non-const. static_assert(std::is_same::value, - "KokkosBlas::spmv: Output Vector must be non-const."); + "KokkosSparse::spmv: Output Vector must be non-const."); // Check compatibility of dimensions at run time. if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { @@ -1653,7 +847,7 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], (static_cast(A.numCols()) > static_cast(x.extent(0))) || (static_cast(A.numRows()) > static_cast(y.extent(0)))) { std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match: " + os << "KokkosSparse::spmv: Dimensions do not match: " << ", A: " << A.numRows() << " x " << A.numCols() << ", x: " << x.extent(0) << " x " << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); @@ -1664,7 +858,7 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], (static_cast(A.numCols()) > static_cast(y.extent(0))) || (static_cast(A.numRows()) > static_cast(x.extent(0)))) { std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match (transpose): " + os << "KokkosSparse::spmv: Dimensions do not match (transpose): " << ", A: " << A.numRows() << " x " << A.numCols() << ", x: " << x.extent(0) << " x " << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); @@ -1685,11 +879,11 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], typedef Kokkos::View< typename XVector::const_value_type*, typename YVector::array_layout, typename XVector::device_type, - Kokkos::MemoryTraits > + Kokkos::MemoryTraits> XVector_SubInternal; typedef Kokkos::View< typename YVector::non_const_value_type*, typename YVector::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > + typename YVector::device_type, Kokkos::MemoryTraits> YVector_SubInternal; XVector_SubInternal x_i = Kokkos::subview(x, Kokkos::ALL(), 0); @@ -1706,28 +900,7 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], } // Call true rank 2 vector implementation - { - typedef Kokkos::View< - typename XVector::const_value_type**, typename XVector::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View > - YVector_Internal; - - XVector_Internal x_i = x; - YVector_Internal y_i = y; - - return KokkosSparse::Impl::SPMV_MV< - ExecutionSpace, AMatrix_Internal, XVector_Internal, - YVector_Internal>::spmv_mv(space, - KokkosKernels::Experimental::Controls(), - mode, alpha, A_i, x_i, beta, y_i); - } + spmv(space, mode, alpha, A, x, beta, y); } template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y); + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y); +}; + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) +template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); + return true; + } + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + +#else + +template +struct SPMV2D1D { + static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } + + template + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } +}; +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || !defined(KOKKOSKERNELS_ETI_ONLY) +template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); + return true; + } + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + +#else + +template +struct SPMV2D1D { + static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } + + template + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } +}; +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || !defined(KOKKOSKERNELS_ETI_ONLY) +template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); + return true; + } + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + +#else + +template +struct SPMV2D1D { + static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } + + template + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } +}; +#endif +} // namespace Impl + +template +using SPMV2D1D + [[deprecated("KokkosSparse::SPMV2D1D is not part of the public interface - " + "use KokkosSparse::spmv instead")]] = + Impl::SPMV2D1D; + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + // Default to fast setup, since this handle can't be reused + SPMVAlgorithm algo = SPMV_FAST_SETUP; + // Translate the Controls algorithm selection to the SPMVHandle algorithm. + // This maintains the old behavior, where any manually set name that isn't + // "tpl" gives native. + // + // This also uses the behavior set by #2021: "merge" was a hint to use + // cuSPARSE merge path, but that path is gone so just use the normal TPL. + // "merge-path" means to use the KK merge-path implementation. + // + // And also support the 3 different BSR algorithms by their old names. + if (controls.isParameter("algorithm")) { + std::string algoName = controls.getParameter("algorithm"); + if (algoName == "merge" || algoName == "tpl") + algo = SPMV_FAST_SETUP; + else if (algoName == "native-merge") + algo = SPMV_MERGE_PATH; + else if (algoName == "v4.1") + algo = SPMV_BSR_V41; + else if (algoName == "v4.2") + algo = SPMV_BSR_V41; + else if (algoName == "experimental_bsr_tc" || algoName == "experimental_tc") + algo = SPMV_BSR_TC; + else + throw std::invalid_argument( + std::string("KokkosSparse::spmv: controls algorithm name '") + + algoName + "' is not supported.\n"); + } + KokkosSparse::SPMVHandle handle( + algo); + // Pull out any expert tuning parameters + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + handle.force_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + handle.force_static_schedule = true; + } + } + if (controls.isParameter("team size")) + handle.team_size = std::stoi(controls.getParameter("team size")); + if (controls.isParameter("vector length")) + handle.vector_length = std::stoi(controls.getParameter("vector length")); + if (controls.isParameter("rows per thread")) + handle.rows_per_thread = + std::stoll(controls.getParameter("rows per thread")); + spmv(space, &handle, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE&) { + spmv(space, controls, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE&) { + spmv(controls, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO&) { + spmv(space, controls, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO&) { + spmv(controls, mode, alpha, A, x, beta, y); +} + +} // namespace KokkosSparse + +#endif diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp new file mode 100644 index 0000000000..31e21481af --- /dev/null +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -0,0 +1,393 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_HANDLE_HPP_ +#define KOKKOSSPARSE_SPMV_HANDLE_HPP_ + +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_BsrMatrix.hpp" +// Use TPL utilities for safely finalizing matrix descriptors, etc. +#include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosSparse_Utils_rocsparse.hpp" +#include "KokkosSparse_Utils_mkl.hpp" + +namespace KokkosSparse { + +/// SPMVAlgorithm values can be used to select different algorithms/methods for +/// performing SpMV computations. +enum SPMVAlgorithm { + SPMV_DEFAULT, /// Default algorithm: best overall performance for repeated + /// applications of SpMV. + SPMV_FAST_SETUP, /// Best performance in the non-reuse case, where the handle + /// is only used once. + SPMV_NATIVE, /// Use the best KokkosKernels implementation, even if a TPL + /// implementation is available. + SPMV_MERGE_PATH, /// Use load-balancing merge path algorithm (for CrsMatrix + /// only) + SPMV_BSR_V41, /// Use experimental version 4.1 algorithm (for BsrMatrix only) + SPMV_BSR_V42, /// Use experimental version 4.2 algorithm (for BsrMatrix only) + SPMV_BSR_TC /// Use experimental tensor core algorithm (for BsrMatrix only) +}; + +namespace Experimental { +/// Precision to use in the tensor core implementation of Bsr SpMV +enum class Bsr_TC_Precision { + Automatic, ///< Use Double, unless operations match mixed precision + Double, ///< fp64 += fp64 * fp64 + Mixed ///< fp32 += fp16 * fp16 +}; +} // namespace Experimental + +/// Get the name of a SPMVAlgorithm enum constant +inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { + switch (a) { + case SPMV_DEFAULT: return "SPMV_DEFAULT"; + case SPMV_FAST_SETUP: return "SPMV_FAST_SETUP"; + case SPMV_NATIVE: return "SPMV_NATIVE"; + case SPMV_MERGE_PATH: return "SPMV_MERGE_PATH"; + case SPMV_BSR_V41: return "SPMV_BSR_V41"; + case SPMV_BSR_V42: return "SPMV_BSR_V42"; + case SPMV_BSR_TC: return "SPMV_BSR_TC"; + } + throw std::invalid_argument( + "SPMVHandle::get_algorithm_name: unknown algorithm"); + return ""; +} + +/// Return true if the given algorithm is always a native (KokkosKernels) +/// implementation, and false if it may be implemented by a TPL. +inline bool is_spmv_algorithm_native(SPMVAlgorithm a) { + switch (a) { + case SPMV_NATIVE: + case SPMV_MERGE_PATH: + case SPMV_BSR_V41: + case SPMV_BSR_V42: + case SPMV_BSR_TC: return true; + default: return false; + } +} + +namespace Impl { +// Execution spaces do not support operator== in public interface, even though +// in practice the major async/GPU spaces do have the feature. +// This is a conservative check for whether e1 and e2 are known to be the +// same. If it cannot be determined, assume they are different. +template +inline bool exec_spaces_same(const ExecutionSpace&, const ExecutionSpace&) { + return false; +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +inline bool exec_spaces_same(const Kokkos::Cuda& e1, + const Kokkos::Cuda& e2) { + return e1.impl_internal_space_instance() == e2.impl_internal_space_instance(); +} +#endif +#ifdef KOKKOS_ENABLE_HIP +template <> +inline bool exec_spaces_same(const Kokkos::HIP& e1, + const Kokkos::HIP& e2) { + return e1.impl_internal_space_instance() == e2.impl_internal_space_instance(); +} +#endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +inline bool exec_spaces_same( + const Kokkos::Experimental::SYCL& e1, + const Kokkos::Experimental::SYCL& e2) { + return e1.impl_internal_space_instance() == e2.impl_internal_space_instance(); +} +#endif + +template +struct TPL_SpMV_Data { + // Disallow default construction: must provide the initial execution space + TPL_SpMV_Data() = delete; + TPL_SpMV_Data(const ExecutionSpace& exec_) : exec(exec_) {} + void set_exec_space(const ExecutionSpace& new_exec) { + // Check if new_exec is different from (old) exec. + // If it is, fence the old exec now. + // That way, SPMVHandle cleanup doesn't need + // to worry about resources still being in use on the old exec. + if (!exec_spaces_same(exec, new_exec)) { + exec.fence(); + exec = new_exec; + } + } + virtual ~TPL_SpMV_Data() {} + ExecutionSpace exec; +}; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) +// Data used by cuSPARSE >=10.3 for both single-vector (SpMV) and multi-vector +// (SpMM). +// TODO: in future, this can also be used for BSR (cuSPARSE >=12.2) +struct CuSparse10_SpMV_Data : public TPL_SpMV_Data { + CuSparse10_SpMV_Data(const Kokkos::Cuda& exec_) : TPL_SpMV_Data(exec_) {} + ~CuSparse10_SpMV_Data() { + // Prefer cudaFreeAsync on the stream that last executed a spmv, but + // async memory management was introduced in 11.2 +#if (CUDA_VERSION >= 11020) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(buffer, exec.cuda_stream())); +#else + // Fence here to ensure spmv is not still using buffer + // (cudaFree does not do a device synchronize) + exec.fence(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer)); +#endif + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(mat)); + } + + cusparseSpMatDescr_t mat; + size_t bufferSize = 0; + void* buffer = nullptr; +}; +#endif + +// Data used by cuSPARSE <10.3 for CRS, and >=9 for BSR +struct CuSparse9_SpMV_Data : public TPL_SpMV_Data { + CuSparse9_SpMV_Data(const Kokkos::Cuda& exec_) : TPL_SpMV_Data(exec_) {} + ~CuSparse9_SpMV_Data() { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(mat)); + } + + cusparseMatDescr_t mat; +}; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +struct RocSparse_CRS_SpMV_Data : public TPL_SpMV_Data { + RocSparse_CRS_SpMV_Data(const Kokkos::HIP& exec_) : TPL_SpMV_Data(exec_) {} + ~RocSparse_CRS_SpMV_Data() { + // note: hipFree includes an implicit device synchronize + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(buffer)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_spmat_descr(mat)); + } + + rocsparse_spmat_descr mat; + size_t bufferSize = 0; + void* buffer = nullptr; +}; + +struct RocSparse_BSR_SpMV_Data : public TPL_SpMV_Data { + RocSparse_BSR_SpMV_Data(const Kokkos::HIP& exec_) : TPL_SpMV_Data(exec_) {} + ~RocSparse_BSR_SpMV_Data() { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_descr(mat)); +#if (KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400) + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_info(info)); +#endif + } + + rocsparse_mat_descr mat; +#if (KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400) + rocsparse_mat_info info; +#endif +}; +#endif + +// note: header defining __INTEL_MKL__ is pulled in above by Utils_mkl.hpp +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#if (__INTEL_MKL__ > 2017) +// Data for classic MKL (both CRS and BSR) +template +struct MKL_SpMV_Data : public TPL_SpMV_Data { + MKL_SpMV_Data(const ExecutionSpace& exec_) + : TPL_SpMV_Data(exec_) {} + ~MKL_SpMV_Data() { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mat)); + // descr is just a plain-old-data struct, no cleanup to do + } + + sparse_matrix_t mat; + matrix_descr descr; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +struct OneMKL_SpMV_Data : public TPL_SpMV_Data { + OneMKL_SpMV_Data(const Kokkos::Experimental::SYCL& exec_) + : TPL_SpMV_Data(exec_) {} + ~OneMKL_SpMV_Data() { + // Make sure no spmv is still running with this handle, if exec uses an + // out-of-order queue (rare case) + if (!exec.sycl_queue().is_in_order()) exec.fence(); +#if INTEL_MKL_VERSION >= 20230200 + // MKL 2023.2 and up make this async release okay even though it takes a + // pointer to mat, which is going out of scope after this destructor + oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &mat); +#else + // But in older versions, wait on ev_release before letting mat go out of + // scope + auto ev_release = + oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &mat); + ev_release.wait(); +#endif + } + + oneapi::mkl::sparse::matrix_handle_t mat; +}; +#endif +#endif + +template +struct SPMVHandleImpl { + using ExecutionSpaceType = ExecutionSpace; + // Do not allow const qualifier on Scalar, Ordinal, Offset (otherwise this + // type won't match the ETI'd type). Users should not use SPMVHandleImpl + // directly and SPMVHandle explicitly removes const, so this should never + // happen in practice. + static_assert(!std::is_const_v, + "SPMVHandleImpl: Scalar must not be a const type"); + static_assert(!std::is_const_v, + "SPMVHandleImpl: Offset must not be a const type"); + static_assert(!std::is_const_v, + "SPMVHandleImpl: Ordinal must not be a const type"); + SPMVHandleImpl(SPMVAlgorithm algo_) : algo(algo_) {} + ~SPMVHandleImpl() { + if (tpl) delete tpl; + } + void set_exec_space(const ExecutionSpace& exec) { + if (tpl) tpl->set_exec_space(exec); + } + bool is_set_up = false; + const SPMVAlgorithm algo = SPMV_DEFAULT; + TPL_SpMV_Data* tpl = nullptr; + // Expert tuning parameters for native SpMV + // TODO: expose a proper Experimental interface to set these. Currently they + // can be assigned directly in the SPMVHandle as they are public members. + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + bool force_static_schedule = false; + bool force_dynamic_schedule = false; + KokkosSparse::Experimental::Bsr_TC_Precision bsr_tc_precision = + KokkosSparse::Experimental::Bsr_TC_Precision::Automatic; +}; +} // namespace Impl + +// clang-format off +/// \class SPMVHandle +/// \brief Opaque handle type for KokkosSparse::spmv. It passes the choice of +/// algorithm to the spmv implementation, and also may store internal data which can be used to +/// speed up the spmv computation. +/// \tparam DeviceType A Kokkos::Device or execution space where the spmv computation will be run. +/// Does not necessarily need to match AMatrix's device type, but its execution space needs to be able +/// to access the memory spaces of AMatrix, XVector and YVector. +/// \tparam AMatrix A specialization of KokkosSparse::CrsMatrix or +/// KokkosSparse::BsrMatrix. +/// +/// SPMVHandle's internal resources are lazily allocated and initialized by the first +/// spmv call. +/// +/// SPMVHandle automatically cleans up all allocated resources when it is destructed. +/// No fencing by the user is required between the final spmv and cleanup. +/// +/// A SPMVHandle instance can be used in any number of calls, with any execution space +/// instance and any X/Y vectors (with matching types) each call. +/// +/// \warning However, all calls to spmv with a given instance of SPMVHandle must use the +/// same matrix. +// clang-format on + +template +struct SPMVHandle + : public Impl::SPMVHandleImpl { + using ImplType = + Impl::SPMVHandleImpl; + // Note: these typedef names cannot shadow template parameters + using AMatrixType = AMatrix; + using XVectorType = XVector; + using YVectorType = YVector; + using ExecutionSpaceType = typename DeviceType::execution_space; + // Check all template parameters for compatibility with each other + // NOTE: we do not require that ExecutionSpace matches + // AMatrix::execution_space. For example, if the matrix's device is it is allowed to run spmv on Serial. + static_assert(is_crs_matrix_v || + Experimental::is_bsr_matrix_v, + "SPMVHandle: AMatrix must be a specialization of CrsMatrix or " + "BsrMatrix."); + static_assert(Kokkos::is_view::value, + "SPMVHandle: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "SPMVHandle: YVector must be a Kokkos::View."); + static_assert(XVector::rank() == YVector::rank(), + "SPMVHandle: ranks of XVector and YVector must match."); + static_assert( + XVector::rank() == size_t(1) || YVector::rank() == size_t(2), + "SPMVHandle: XVector and YVector must be both rank-1 or both rank-2."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "SPMVHandle: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "SPMVHandle: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "SPMVHandle: YVector must be accessible from ExecutionSpace"); + + // Prevent copying (this object does not support reference counting) + SPMVHandle(const SPMVHandle&) = delete; + SPMVHandle& operator=(const SPMVHandle&) = delete; + + /// \brief Create a new SPMVHandle using the given algorithm. + SPMVHandle(SPMVAlgorithm algo_ = SPMV_DEFAULT) : ImplType(algo_) { + // Validate the choice of algorithm based on A's type + if constexpr (is_crs_matrix_v) { + switch (get_algorithm()) { + case SPMV_BSR_V41: + case SPMV_BSR_V42: + case SPMV_BSR_TC: + throw std::invalid_argument(std::string("SPMVHandle: algorithm ") + + get_spmv_algorithm_name(get_algorithm()) + + " cannot be used if A is a CrsMatrix"); + default:; + } + } else { + switch (get_algorithm()) { + case SPMV_MERGE_PATH: + throw std::invalid_argument(std::string("SPMVHandle: algorithm ") + + get_spmv_algorithm_name(get_algorithm()) + + " cannot be used if A is a BsrMatrix"); + default:; + } + } + } + + /// Get the SPMVAlgorithm used by this handle + SPMVAlgorithm get_algorithm() const { return this->algo; } +}; + +} // namespace KokkosSparse + +#endif diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 07bb0a0f0a..3a68ba348e 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -25,7 +25,8 @@ namespace KokkosSparse { namespace Experimental { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_bsrmatrix_tpl_spec_avail { enum : bool { value = false }; }; @@ -41,6 +42,8 @@ struct spmv_bsrmatrix_tpl_spec_avail { template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -127,22 +130,24 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_bsrmatrix_tpl_spec_avail< \ - EXECSPACE, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - const SCALAR, const MKL_INT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -166,7 +171,8 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #endif // Specialization struct which defines whether a specialization exists -template > struct spmv_mv_bsrmatrix_tpl_spec_avail { @@ -184,6 +190,8 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { template <> \ struct spmv_mv_bsrmatrix_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -231,23 +239,25 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - EXECSPACE, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - const SCALAR, const int, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const int, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -279,6 +289,8 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 75752190e7..875913214c 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -18,228 +18,225 @@ #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #include "KokkosKernels_AlwaysFalse.hpp" -#include "KokkosKernels_Controls.hpp" #include "KokkosSparse_Utils_mkl.hpp" #include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosKernels_tpl_handles_decl.hpp" -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && (__INTEL_MKL__ > 2017) #include namespace KokkosSparse { namespace Experimental { namespace Impl { -#if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() using KokkosSparse::Impl::mode_kk_to_mkl; -inline matrix_descr getDescription() { - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - return A_descr; -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, - MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, float* y) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const double* Avalues, - const double* x, double* y) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, - Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); - - MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, - Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); - - matrix_descr A_descr = getDescription(); - MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); -} - -inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, - float beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, MKL_INT colx, MKL_INT ldx, - float* y, MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, - ldx, beta, y, ldy)); -} - -inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const double* Avalues, const double* x, - MKL_INT colx, MKL_INT ldx, double* y, - MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, - ldx, beta, y, ldy)); -} - -inline void spm_mv_block_impl_mkl( - sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, const Kokkos::complex* x, - MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); - - MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, - reinterpret_cast(x), colx, ldx, - beta_mkl, reinterpret_cast(y), ldy)); +// Note: Scalar here is the Kokkos type, not the MKL type +template +inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, + Scalar beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Scalar* Avalues, const Scalar* x, Scalar* y) { + using MKLScalar = + typename KokkosSparse::Impl::KokkosToMKLScalar::type; + using ExecSpace = typename Handle::ExecutionSpaceType; + using Subhandle = KokkosSparse::Impl::MKL_SpMV_Data; + Subhandle* subhandle; + const MKLScalar* x_mkl = reinterpret_cast(x); + MKLScalar* y_mkl = reinterpret_cast(y); + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); + } else { + // Use the default execution space instance, as classic MKL does not use + // a specific instance. + subhandle = new Subhandle(ExecSpace()); + handle->tpl = subhandle; + subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; + subhandle->descr.mode = SPARSE_FILL_MODE_FULL; + subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; + // Note: the create_csr routine requires non-const values even though + // they're not actually modified + MKLScalar* Avalues_mkl = + reinterpret_cast(const_cast(Avalues)); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } + handle->is_set_up = true; + } + MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar(alpha); + MKLScalar beta_mkl = KokkosSparse::Impl::KokkosToMKLScalar(beta); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } } -inline void spm_mv_block_impl_mkl( - sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, const Kokkos::complex* x, - MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); - - matrix_descr A_descr = getDescription(); - MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, - reinterpret_cast(x), colx, ldx, - beta_mkl, reinterpret_cast(y), ldy)); +// Note: Scalar here is the Kokkos type, not the MKL type +template +inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, + Scalar beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Scalar* Avalues, const Scalar* x, + MKL_INT colx, MKL_INT ldx, Scalar* y, MKL_INT ldy) { + using MKLScalar = + typename KokkosSparse::Impl::KokkosToMKLScalar::type; + using ExecSpace = typename Handle::ExecutionSpaceType; + using Subhandle = KokkosSparse::Impl::MKL_SpMV_Data; + Subhandle* subhandle; + const MKLScalar* x_mkl = reinterpret_cast(x); + MKLScalar* y_mkl = reinterpret_cast(y); + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); + } else { + // Use the default execution space instance, as classic MKL does not use + // a specific instance. + subhandle = new Subhandle(ExecSpace()); + handle->tpl = subhandle; + subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; + subhandle->descr.mode = SPARSE_FILL_MODE_FULL; + subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; + // Note: the create_csr routine requires non-const values even though + // they're not actually modified + MKLScalar* Avalues_mkl = + reinterpret_cast(const_cast(Avalues)); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } + handle->is_set_up = true; + } + MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar(alpha); + MKLScalar beta_mkl = KokkosSparse::Impl::KokkosToMKLScalar(beta); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } } -#endif - -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV_BSRMATRIX< \ - EXECSPACE, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const>, \ - Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, MKL_INT const, device_type, \ - Kokkos::MemoryTraits, MKL_INT const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_bsrmatrix( \ - const EXECSPACE&, \ - const KokkosKernels::Experimental::Controls& /*controls*/, \ - const char mode[], const coefficient_type& alpha, const AMatrix& A, \ - const XVector& X, const coefficient_type& beta, const YVector& Y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ - A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - A.graph.entries.data(), A.values.data(), X.data(), \ - Y.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix(const EXECSPACE&, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& X, \ + const coefficient_type& beta, \ + const YVector& Y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_bsr_mkl(handle, mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.blockDim(), A.graph.row_map.data(), \ + A.graph.entries.data(), A.values.data(), X.data(), \ + Y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -268,6 +265,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, template <> \ struct SPMV_MV_BSRMATRIX< \ EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ SCALAR const, MKL_INT const, \ Kokkos::Device, \ @@ -281,9 +280,12 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits>, \ true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, MKL_INT const, device_type, \ - Kokkos::MemoryTraits, MKL_INT const>; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -291,21 +293,22 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits>; \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_mv_bsrmatrix( \ - const EXECSPACE&, \ - const KokkosKernels::Experimental::Controls& /*controls*/, \ - const char mode[], const coefficient_type& alpha, const AMatrix& A, \ - const XVector& X, const coefficient_type& beta, const YVector& Y) { \ + static void spmv_mv_bsrmatrix(const EXECSPACE&, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& X, \ + const coefficient_type& beta, \ + const YVector& Y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ MKL_INT colx = static_cast(X.extent(1)); \ MKL_INT ldx = static_cast(X.stride_1()); \ MKL_INT ldy = static_cast(Y.stride_1()); \ - spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ - A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - A.graph.entries.data(), A.values.data(), X.data(), \ - colx, ldx, Y.data(), ldy); \ + spmv_mv_bsr_mkl(handle, mode_kk_to_mkl(mode[0]), alpha, beta, \ + A.numRows(), A.numCols(), A.blockDim(), \ + A.graph.row_map.data(), A.graph.entries.data(), \ + A.values.data(), X.data(), colx, ldx, Y.data(), ldy); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -338,12 +341,11 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, } // namespace Experimental } // namespace KokkosSparse -#endif // KOKKOSKERNELS_ENABLE_TPL_MKL +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && (__INTEL_MKL__ > 2017) // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" -#include "KokkosSparse_Utils_cusparse.hpp" // // From https://docs.nvidia.com/cuda/cusparse/index.html#bsrmv @@ -352,23 +354,28 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, // - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported // - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported. // +#if (9000 <= CUDA_VERSION) + +#include "KokkosSparse_Utils_cusparse.hpp" + namespace KokkosSparse { namespace Experimental { namespace Impl { -template -void spmv_block_impl_cusparse( - const Kokkos::Cuda& exec, - const KokkosKernels::Experimental::Controls& controls, const char mode[], - typename YVector::non_const_value_type const& alpha, const AMatrix& A, - const XVector& x, typename YVector::non_const_value_type const& beta, - const YVector& y) { +template +void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, + const char mode[], + typename YVector::non_const_value_type const& alpha, + const AMatrix& A, const XVector& x, + typename YVector::non_const_value_type const& beta, + const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using entry_type = typename AMatrix::non_const_ordinal_type; using value_type = typename AMatrix::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); @@ -382,70 +389,75 @@ void spmv_block_impl_cusparse( } } -#if (9000 <= CUDA_VERSION) + KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; + + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + /* create and set the subhandle and matrix descriptor */ + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl = subhandle; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); + handle->is_set_up = true; + } - /* create and set the matrix descriptor */ - cusparseMatDescr_t descrA = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; /* perform the actual SpMV operation */ - if ((std::is_same::value) && - (std::is_same::value)) { - if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else { - throw std::logic_error( - "Trying to call cusparse[*]bsrmv with a scalar type not " - "float/double, " - "nor complex of either!"); - } + static_assert( + std::is_same_v && std::is_same_v, + "With cuSPARSE non-generic API, offset and entry types must both be int. " + "Something wrong with TPL avail logic."); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), A.graph.row_map.data(), + A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), + subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); } else { - throw std::logic_error( - "With cuSPARSE pre-10.0, offset and entry types must be int. " - "Something wrong with TPL avail logic."); + static_assert(KokkosKernels::Impl::always_false_v, + "Trying to call cusparse[*]bsrmv with a scalar type not " + "float/double, nor complex of either!"); } - - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // (9000 <= CUDA_VERSION) } // Reference @@ -463,27 +475,22 @@ void spmv_block_impl_cusparse( // -> C = t(t(B)) * t(A) + C // -> C = B * t(A) + C // This is impossible in cuSparse without explicitly transposing A, -// so we just do not support LayoutRight in cuSparse TPL now -// -template < - class AMatrix, class XVector, class YVector, - std::enable_if_t::value && - std::is_same::value, - bool> = true> -void spm_mv_block_impl_cusparse( - const Kokkos::Cuda& exec, - const KokkosKernels::Experimental::Controls& controls, const char mode[], - typename YVector::non_const_value_type const& alpha, const AMatrix& A, - const XVector& x, typename YVector::non_const_value_type const& beta, - const YVector& y) { +// so we just do not support LayoutRight in cuSparse TPL now (this is +// statically asserted here) +template +void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, + const char mode[], + typename YVector::non_const_value_type const& alpha, + const AMatrix& A, const XVector& x, + typename YVector::non_const_value_type const& beta, + const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using entry_type = typename AMatrix::non_const_ordinal_type; using value_type = typename AMatrix::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); @@ -499,123 +506,136 @@ void spm_mv_block_impl_cusparse( int colx = static_cast(x.extent(1)); - // ldx and ldy should be the leading dimension of X,Y respectively - const int ldx = static_cast(x.extent(0)); - const int ldy = static_cast(y.extent(0)); + // ldx and ldy should be the leading dimension (stride between columns) of X,Y + // respectively + const int ldx = static_cast(x.stride(1)); + const int ldy = static_cast(y.stride(1)); -#if (9000 <= CUDA_VERSION) - - /* create and set the matrix descriptor */ - cusparseMatDescr_t descrA = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + static_assert( + std::is_same_v && + std::is_same_v, + "cuSPARSE requires both X and Y to be LayoutLeft."); + + KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; + + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + /* create and set the subhandle and matrix descriptor */ + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl = subhandle; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); + handle->is_set_up = true; + } cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; /* perform the actual SpMV operation */ - if ((std::is_same::value) && - (std::is_same::value)) { - if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else { - throw std::logic_error( - "Trying to call cusparse[*]bsrmm with a scalar type not " - "float/double, " - "nor complex of either!"); - } + static_assert( + std::is_same_v && std::is_same_v, + "With cuSPARSE non-generic API, offset and entry types must both be int. " + "Something wrong with TPL avail logic."); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), A.graph.row_map.data(), + A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), + subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); } else { - throw std::logic_error( - "With cuSPARSE pre-10.0, offset and entry types must be int. " - "Something wrong with TPL avail logic."); + static_assert(KokkosKernels::Impl::always_false_v, + "Trying to call cusparse[*]bsrmm with a scalar type not " + "float/double, nor complex of either!"); } - - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // (9000 <= CUDA_VERSION) } -#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV_BSRMATRIX< \ - Kokkos::Cuda, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const>, \ - Kokkos::View< \ - SCALAR const*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, device_type, memory_trait_type, \ - OFFSET const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_bsrmatrix(const Kokkos::Cuda& exec, \ - const Controls& controls, const char mode[], \ - const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix(const Kokkos::Cuda& exec, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_bsr_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::CudaSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -664,57 +684,59 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif // (9000 <= CUDA_VERSION) #undef KOKKOSSPARSE_SPMV_CUSPARSE // cuSparse TPL does not support LayoutRight for this operation // only specialize for LayoutLeft -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, SPACE, \ - ETI_AVAIL) \ - template <> \ - struct SPMV_MV_BSRMATRIX< \ - Kokkos::Cuda, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const>, \ - Kokkos::View< \ - SCALAR const**, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - false, true, ETI_AVAIL> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, device_type, memory_trait_type, \ - OFFSET const>; \ - using XVector = Kokkos::View< \ - SCALAR const**, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View \ + struct SPMV_MV_BSRMATRIX< \ + Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true, ETI_AVAIL> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ + using XVector = Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_mv_bsrmatrix(const Kokkos::Cuda& exec, \ - const Controls& controls, const char mode[], \ - const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spm_mv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_mv_bsrmatrix(const Kokkos::Cuda& exec, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mv_bsr_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, true) @@ -740,13 +762,12 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::CudaUVMSpace, false) -#endif // (9000 <= CUDA_VERSION) - #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE } // namespace Impl } // namespace Experimental } // namespace KokkosSparse +#endif // (9000 <= CUDA_VERSION) #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -763,13 +784,13 @@ namespace KokkosSparse { namespace Experimental { namespace Impl { -template -void spmv_block_impl_rocsparse( - const Kokkos::HIP& exec, - const KokkosKernels::Experimental::Controls& controls, const char mode[], - typename YVector::non_const_value_type const& alpha, const AMatrix& A, - const XVector& x, typename YVector::non_const_value_type const& beta, - const YVector& y) { +template +void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, + const char mode[], + typename YVector::non_const_value_type const& alpha, + const AMatrix& A, const XVector& x, + typename YVector::non_const_value_type const& beta, + const YVector& y) { /* rocm 5.4.0 rocsparse_*bsrmv reference: https://rocsparse.readthedocs.io/en/rocm-5.4.0/usermanual.html#rocsparse-bsrmv-ex @@ -818,9 +839,10 @@ void spmv_block_impl_rocsparse( Kokkos::LayoutStride>, "A entries must be contiguous"); - rocsparse_handle handle = controls.getRocsparseHandle(); + rocsparse_handle rocsparseHandle = + KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; // resets handle stream to NULL when out of scope - KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(handle, exec); + KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(rocsparseHandle, exec); // set the mode rocsparse_operation trans; @@ -864,45 +886,91 @@ void spmv_block_impl_rocsparse( reinterpret_cast(&beta); rocsparse_value_type* y_ = reinterpret_cast(y.data()); - rocsparse_mat_descr descr; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr)); - rocsparse_mat_info info; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info)); + KokkosSparse::Impl::RocSparse_BSR_SpMV_Data* subhandle; + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for rocsparse BSR"); + } else { + subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); + handle->tpl = subhandle; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_create_mat_descr(&subhandle->mat)); + // *_ex* functions deprecated in introduced in 6+ +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_create_mat_info(&subhandle->info)); + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } + // *_ex* functions introduced in 5.4.0 +#elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 + // No analysis step in the older versions +#else + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_create_mat_info(&subhandle->info)); + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } +#endif + handle->is_set_up = true; + } // *_ex* functions deprecated in introduced in 6+ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); @@ -911,72 +979,59 @@ void spmv_block_impl_rocsparse( #elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 if constexpr (std::is_same_v) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else if constexpr (std::is_same_v) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } #else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_sbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_dbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_cbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_zbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } #endif - rocsparse_destroy_mat_descr(descr); - rocsparse_destroy_mat_info(info); - -} // spmv_block_impl_rocsparse +} // spmv_bsr_rocsparse #define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ SCALAR const, ORDINAL const, Kokkos::Device, \ Kokkos::MemoryTraits, OFFSET const>, \ @@ -988,20 +1043,22 @@ void spmv_block_impl_rocsparse( true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, device_type, memory_trait_type, \ - OFFSET const>; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const*, LAYOUT, device_type, \ Kokkos::MemoryTraits>; \ using YVector = \ Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_bsrmatrix(const Kokkos::HIP& exec, \ - const Controls& controls, const char mode[], \ + static void spmv_bsrmatrix(const Kokkos::HIP& exec, Handle* handle, \ + const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -1009,7 +1066,7 @@ void spmv_block_impl_rocsparse( std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ + spmv_bsr_rocsparse(exec, handle, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 5e33df1fa3..0ef473c54a 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -21,7 +21,8 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template > struct spmv_mv_tpl_spec_avail { @@ -33,6 +34,8 @@ struct spmv_mv_tpl_spec_avail { template <> \ struct spmv_mv_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index f28e04e26b..1de91cdf27 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -18,8 +18,7 @@ #define KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ #include - -#include "KokkosKernels_Controls.hpp" +#include "KokkosKernels_tpl_handles_decl.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -90,9 +89,8 @@ cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { return descr; } -template -void spmv_mv_cusparse(const Kokkos::Cuda &exec, - const KokkosKernels::Experimental::Controls &controls, +template +void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, const char mode[], typename YVector::non_const_value_type const &alpha, const AMatrix &A, const XVector &x, @@ -110,10 +108,18 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, using y_value_type = typename YVector::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ TemporarySetCusparseStream(cusparseHandle, exec); + /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ + const cusparseIndexType_t myCusparseOffsetType = + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); + const cudaDataType aCusparseType = cuda_data_type_from(); + /* Set the operation mode */ cusparseOperation_t opA; switch (toupper(mode[0])) { @@ -127,21 +133,6 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, } } - /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ - const cusparseIndexType_t myCusparseOffsetType = - cusparse_index_type_t_from(); - const cusparseIndexType_t myCusparseEntryType = - cusparse_index_type_t_from(); - const cudaDataType aCusparseType = cuda_data_type_from(); - - /* create matrix */ - cusparseSpMatDescr_t A_cusparse; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( - &A_cusparse, A.numRows(), A.numCols(), A.nnz(), - (void *)A.graph.row_map.data(), (void *)A.graph.entries.data(), - (void *)A.values.data(), myCusparseOffsetType, myCusparseEntryType, - CUSPARSE_INDEX_BASE_ZERO, aCusparseType)); - /* create lhs and rhs NOTE: The descriptions always say vecX and vecY are column-major cusparse order. For CUSPARSE_VERSION 10301 this is the only supported ordering. if X @@ -160,9 +151,9 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, // CUSPARSE_MM_ALG_DEFAULT was deprecated in CUDA 11.0.1 / cuSPARSE 11.0.0 and // removed in CUDA 12.0.0 / cuSPARSE 12.0.0 #if CUSPARSE_VERSION < 11000 - const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; + cusparseSpMMAlg_t algo = CUSPARSE_MM_ALG_DEFAULT; #else - const cusparseSpMMAlg_t alg = CUSPARSE_SPMM_ALG_DEFAULT; + cusparseSpMMAlg_t algo = CUSPARSE_SPMM_ALG_DEFAULT; #endif // the precision of the SpMV @@ -181,21 +172,39 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, } } - size_t bufferSize = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM_bufferSize( - cusparseHandle, opA, opB, &alpha, A_cusparse, vecX, &beta, vecY, - computeType, alg, &bufferSize)); + KokkosSparse::Impl::CuSparse10_SpMV_Data *subhandle; + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); + handle->tpl = subhandle; + /* create matrix */ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), + (void *)A.graph.row_map.data(), (void *)A.graph.entries.data(), + (void *)A.values.data(), myCusparseOffsetType, myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, aCusparseType)); + + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM_bufferSize( + cusparseHandle, opA, opB, &alpha, subhandle->mat, vecX, &beta, vecY, + computeType, algo, &subhandle->bufferSize)); + + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(&subhandle->buffer, subhandle->bufferSize)); + + handle->is_set_up = true; + } - void *dBuffer = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM(cusparseHandle, opA, opB, &alpha, - A_cusparse, vecX, &beta, vecY, - computeType, alg, dBuffer)); + subhandle->mat, vecX, &beta, vecY, + computeType, algo, subhandle->buffer)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dBuffer)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecX)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecY)); - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); } #define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ @@ -203,6 +212,8 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, template <> \ struct SPMV_MV< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix< \ SCALAR const, ORDINAL const, Kokkos::Device, \ Kokkos::MemoryTraits, OFFSET const>, \ @@ -214,6 +225,9 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, false, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ using AMatrix = CrsMatrix; \ using XVector = Kokkos::View< \ @@ -224,15 +238,14 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, \ using coefficient_type = typename YVector::non_const_value_type; \ \ - using Controls = KokkosKernels::Experimental::Controls; \ - static void spmv_mv(const Kokkos::Cuda &exec, const Controls &controls, \ + static void spmv_mv(const Kokkos::Cuda &exec, Handle *handle, \ const char mode[], const coefficient_type &alpha, \ const AMatrix &A, const XVector &x, \ const coefficient_type &beta, const YVector &y) { \ std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_mv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ + spmv_mv_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 653ec94811..854c2f2b26 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -24,7 +24,8 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_tpl_spec_avail { enum : bool { value = false }; }; @@ -40,6 +41,8 @@ struct spmv_tpl_spec_avail { template <> \ struct spmv_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -187,6 +190,9 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, template <> \ struct spmv_tpl_spec_avail< \ Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -217,22 +223,24 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_tpl_spec_avail< \ - EXECSPACE, \ - KokkosSparse::CrsMatrix, \ - Kokkos::MemoryTraits, \ - const MKL_INT>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -251,45 +259,57 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #if defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ - template <> \ - struct spmv_tpl_spec_avail< \ - Kokkos::Experimental::SYCL, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const ORDINAL, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const ORDINAL>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const ORDINAL>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; +// intel-oneapi-mkl/2023.2.0: spmv with complex data types produce: +// oneapi::mkl::sparse::gemv: unimplemented functionality: currently does not +// support complex data types. +// TODO: Revisit with later versions and selectively enable this if it's +// working. + KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( float, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( double, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +/* KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +*/ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( float, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( double, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +/* KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +*/ #endif #endif // KOKKOSKERNELS_ENABLE_TPL_MKL diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 10d9f8f2ee..1c589b2330 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -18,8 +18,7 @@ #define KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ #include - -#include "KokkosKernels_Controls.hpp" +#include "KokkosKernels_tpl_handles_decl.hpp" // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -29,10 +28,8 @@ namespace KokkosSparse { namespace Impl { -template -void spmv_cusparse(const Kokkos::Cuda& exec, - const KokkosKernels::Experimental::Controls& controls, - const char mode[], +template +void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -41,7 +38,8 @@ void spmv_cusparse(const Kokkos::Cuda& exec, using value_type = typename AMatrix::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ TemporarySetCusparseStream(cusparseHandle, exec); @@ -70,11 +68,6 @@ void spmv_cusparse(const Kokkos::Cuda& exec, #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) using entry_type = typename AMatrix::non_const_ordinal_type; - /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ - const cusparseIndexType_t myCusparseOffsetType = - cusparse_index_type_t_from(); - const cusparseIndexType_t myCusparseEntryType = - cusparse_index_type_t_from(); cudaDataType myCudaDataType; if (std::is_same::value) @@ -90,13 +83,11 @@ void spmv_cusparse(const Kokkos::Cuda& exec, "Scalar (data) type of CrsMatrix isn't supported by cuSPARSE, yet TPL " "layer says it is"); - /* create matrix */ - cusparseSpMatDescr_t A_cusparse; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( - &A_cusparse, A.numRows(), A.numCols(), A.nnz(), - (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(), - (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType, - CUSPARSE_INDEX_BASE_ZERO, myCudaDataType)); + /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ + const cusparseIndexType_t myCusparseOffsetType = + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); /* create lhs and rhs */ cusparseDnVecDescr_t vecX, vecY; @@ -105,150 +96,170 @@ void spmv_cusparse(const Kokkos::Cuda& exec, KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( &vecY, y.extent_int(0), (void*)y.data(), myCudaDataType)); - size_t bufferSize = 0; - void* dBuffer = NULL; -#if CUSPARSE_VERSION >= 11400 - cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; -#else - cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; -#endif - if (controls.isParameter("algorithm")) { - const std::string algName = controls.getParameter("algorithm"); - if (algName == "default") + // use default cusparse algo for best performance #if CUSPARSE_VERSION >= 11400 - alg = CUSPARSE_SPMV_ALG_DEFAULT; + cusparseSpMVAlg_t algo = CUSPARSE_SPMV_ALG_DEFAULT; #else - alg = CUSPARSE_MV_ALG_DEFAULT; + cusparseSpMVAlg_t algo = CUSPARSE_MV_ALG_DEFAULT; #endif - else if (algName == "merge") -#if CUSPARSE_VERSION >= 11400 - alg = CUSPARSE_SPMV_CSR_ALG2; + + KokkosSparse::Impl::CuSparse10_SpMV_Data* subhandle; + + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); + handle->tpl = subhandle; + + /* create matrix */ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), + (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(), + (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, myCudaDataType)); + + /* size and allocate buffer */ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( + cusparseHandle, myCusparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, myCudaDataType, algo, &subhandle->bufferSize)); + // Async memory management introduced in CUDA 11.2 +#if (CUDA_VERSION >= 11020) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMallocAsync( + &subhandle->buffer, subhandle->bufferSize, exec.cuda_stream())); #else - alg = CUSPARSE_CSRMV_ALG2; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(&subhandle->buffer, subhandle->bufferSize)); #endif + handle->is_set_up = true; } - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( - cusparseHandle, myCusparseOperation, &alpha, A_cusparse, vecX, &beta, - vecY, myCudaDataType, alg, &bufferSize)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); /* perform SpMV */ - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV(cusparseHandle, myCusparseOperation, - &alpha, A_cusparse, vecX, &beta, vecY, - myCudaDataType, alg, dBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSpMV(cusparseHandle, myCusparseOperation, &alpha, subhandle->mat, + vecX, &beta, vecY, myCudaDataType, algo, subhandle->buffer)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dBuffer)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecX)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecY)); - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); #elif (9000 <= CUDA_VERSION) - /* create and set the matrix descriptor */ - cusparseMatDescr_t descrA = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; - /* perform the actual SpMV operation */ - if (std::is_same::value) { - if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - - } else if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else { - throw std::logic_error( - "Trying to call cusparse SpMV with a scalar type not float/double, " - "nor complex of either!"); - } + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); } else { - throw std::logic_error( - "With cuSPARSE pre-10.0, offset type must be int. Something wrong with " - "TPL avail logic."); + /* create and set the subhandle and matrix descriptor */ + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl = subhandle; + cusparseMatDescr_t descrA = 0; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); + handle->is_set_up = true; } - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); + /* perform the actual SpMV operation */ + static_assert( + std::is_same_v, + "With cuSPARSE pre-10.0, offset type must be int. Something wrong with " + "TPL avail logic."); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), A.graph.row_map.data(), + A.graph.entries.data(), reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + + } else if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else { + static_assert( + static_assert(KokkosKernels::Impl::always_false_v, + "Trying to call cusparse SpMV with a scalar type not float/double, " + "nor complex of either!"); + } #endif // CUDA_VERSION } -#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - Kokkos::Cuda, \ - KokkosSparse::CrsMatrix< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const>, \ - Kokkos::View< \ - SCALAR const*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv(const Kokkos::Cuda& exec, const Controls& controls, \ - const char mode[], const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv(const Kokkos::Cuda& exec, Handle* handle, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -// BMK: cuSPARSE that comes with CUDA 9 does not support tranpose or conjugate -// transpose modes. No version of cuSPARSE supports mode C (conjugate, non -// transpose). In those cases, fall back to KokkosKernels native spmv. - #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::CudaSpace, @@ -364,10 +375,8 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, namespace KokkosSparse { namespace Impl { -template -void spmv_rocsparse(const Kokkos::HIP& exec, - const KokkosKernels::Experimental::Controls& controls, - const char mode[], +template +void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -377,9 +386,10 @@ void spmv_rocsparse(const Kokkos::HIP& exec, using value_type = typename AMatrix::non_const_value_type; /* initialize rocsparse library */ - rocsparse_handle handle = controls.getRocsparseHandle(); + rocsparse_handle rocsparseHandle = + KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; /* Set rocsparse to use the given stream until this function exits */ - TemporarySetRocsparseStream(handle, exec); + TemporarySetRocsparseStream(rocsparseHandle, exec); /* Set the operation mode */ rocsparse_operation myRocsparseOperation = mode_kk_to_rocsparse(mode); @@ -391,24 +401,6 @@ void spmv_rocsparse(const Kokkos::HIP& exec, /* Set the scalar type */ rocsparse_datatype compute_type = rocsparse_compute_type(); - /* Create the rocsparse mat and csr descr */ - rocsparse_mat_descr Amat; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&Amat)); - rocsparse_spmat_descr Aspmat; - // We need to do some casting to void* - // Note that row_map is always a const view so const_cast is necessary, - // however entries and values may not be const so we need to check first. - void* csr_row_ptr = - static_cast(const_cast(A.graph.row_map.data())); - void* csr_col_ind = - static_cast(const_cast(A.graph.entries.data())); - void* csr_val = static_cast(const_cast(A.values.data())); - - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( - &Aspmat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, csr_col_ind, - csr_val, offset_index_type, entry_index_type, rocsparse_index_base_zero, - compute_type)); - /* Create rocsparse dense vectors for X and Y */ rocsparse_dnvec_descr vecX, vecY; void* x_data = static_cast( @@ -422,99 +414,134 @@ void spmv_rocsparse(const Kokkos::HIP& exec, &vecY, y.extent_int(0), y_data, rocsparse_compute_type())); - /* Actually perform the SpMV operation, first size buffer, then compute result - */ - size_t buffer_size = 0; - void* tmp_buffer = nullptr; rocsparse_spmv_alg alg = rocsparse_spmv_alg_default; - // Note, Dec 6th 2021 - lbv: - // rocSPARSE offers two diffrent algorithms for spmv - // 1. ocsparse_spmv_alg_csr_adaptive - // 2. rocsparse_spmv_alg_csr_stream - // it is unclear which one is the default algorithm - // or what both algorithms are intended for? - if (controls.isParameter("algorithm")) { - const std::string algName = controls.getParameter("algorithm"); - if (algName == "default") - alg = rocsparse_spmv_alg_default; - else if (algName == "merge") - alg = rocsparse_spmv_alg_csr_stream; + + KokkosSparse::Impl::RocSparse_CRS_SpMV_Data* subhandle; + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for rocsparse CRS"); + } else { + subhandle = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec); + handle->tpl = subhandle; + /* Create the rocsparse csr descr */ + // We need to do some casting to void* + // Note that row_map is always a const view so const_cast is necessary, + // however entries and values may not be const so we need to check first. + void* csr_row_ptr = + static_cast(const_cast(A.graph.row_map.data())); + void* csr_col_ind = + static_cast(const_cast(A.graph.entries.data())); + void* csr_val = + static_cast(const_cast(A.values.data())); + + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( + &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, + csr_col_ind, csr_val, offset_index_type, entry_index_type, + rocsparse_index_base_zero, compute_type)); + + /* Size and allocate buffer, and analyze the matrix */ + +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_buffer_size, + &subhandle->bufferSize, nullptr)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMalloc(&subhandle->buffer, subhandle->bufferSize)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, + &subhandle->bufferSize, subhandle->buffer)); +#elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_auto, + &subhandle->bufferSize, nullptr)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMalloc(&subhandle->buffer, subhandle->bufferSize)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, + &subhandle->bufferSize, subhandle->buffer)); +#else + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, &subhandle->bufferSize, nullptr)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMalloc(&subhandle->buffer, subhandle->bufferSize)); +#endif + handle->is_set_up = true; } + /* Perform the actual computation */ + #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, rocsparse_spmv_stage_buffer_size, - &buffer_size, tmp_buffer)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, rocsparse_spmv_stage_compute, - &buffer_size, tmp_buffer)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, + &subhandle->bufferSize, subhandle->buffer)); #elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( - handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, - compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( - handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, - compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, + &subhandle->bufferSize, subhandle->buffer)); #else KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, &buffer_size, tmp_buffer)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, &buffer_size, tmp_buffer)); + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, + subhandle->mat, vecX, &beta, vecY, compute_type, alg, + &subhandle->bufferSize, subhandle->buffer)); #endif - KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(tmp_buffer)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecY)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecX)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_spmat_descr(Aspmat)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_descr(Amat)); } -#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT, COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - Kokkos::HIP, \ - KokkosSparse::CrsMatrix, \ - Kokkos::MemoryTraits, \ - rocsparse_int const>, \ - Kokkos::View< \ - SCALAR const*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv(const Kokkos::HIP& exec, const Controls& controls, \ - const char mode[], const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT, COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + rocsparse_int const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = KokkosSparse::Impl::SPMVHandleImpl< \ + Kokkos::HIP, Kokkos::HIPSpace, SCALAR, rocsparse_int, rocsparse_int>; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv(const Kokkos::HIP& exec, Handle* handle, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_rocsparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSSPARSE_SPMV_ROCSPARSE(double, Kokkos::LayoutLeft, @@ -550,82 +577,77 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, - MKL_INT n, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, float* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - const_cast(Avalues))); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, - MKL_INT m, MKL_INT n, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const double* Avalues, - const double* x, double* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - const_cast(Avalues))); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - (MKL_Complex8*)Avalues)); - MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); -} - -inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - (MKL_Complex16*)Avalues)); - MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); +// Note: Scalar here is the Kokkos type, not the MKL type +template +inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, + Scalar beta, MKL_INT m, MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Scalar* Avalues, + const Scalar* x, Scalar* y) { + using MKLScalar = typename KokkosToMKLScalar::type; + using ExecSpace = typename Handle::ExecutionSpaceType; + using Subhandle = MKL_SpMV_Data; + Subhandle* subhandle; + const MKLScalar* x_mkl = reinterpret_cast(x); + MKLScalar* y_mkl = reinterpret_cast(y); + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for MKL CRS"); + } else { + // Use the default execution space instance, as classic MKL does not use + // a specific instance. + subhandle = new Subhandle(ExecSpace()); + handle->tpl = subhandle; + subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; + subhandle->descr.mode = SPARSE_FILL_MODE_FULL; + subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; + // Note: the create_csr routine requires non-const values even though + // they're not actually modified + MKLScalar* Avalues_mkl = + reinterpret_cast(const_cast(Avalues)); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } + handle->is_set_up = true; + } + MKLScalar alpha_mkl = KokkosToMKLScalar(alpha); + MKLScalar beta_mkl = KokkosToMKLScalar(beta); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } } // Note: classic MKL runs on Serial/OpenMP but can't use our execution space @@ -633,6 +655,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV, \ KokkosSparse::CrsMatrix< \ SCALAR const, MKL_INT const, \ Kokkos::Device, \ @@ -646,6 +670,9 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::MemoryTraits>, \ true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ using AMatrix = \ CrsMatrix, MKL_INT const>; \ @@ -655,17 +682,16 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, using YVector = Kokkos::View>; \ using coefficient_type = typename YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ \ - static void spmv(const EXECSPACE&, const Controls&, const char mode[], \ + static void spmv(const EXECSPACE&, Handle* handle, const char mode[], \ const coefficient_type& alpha, const AMatrix& A, \ const XVector& x, const coefficient_type& beta, \ const YVector& y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ - A.graph.row_map.data(), A.graph.entries.data(), \ + spmv_mkl(handle, mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.graph.row_map.data(), A.graph.entries.data(), \ A.values.data(), x.data(), y.data()); \ Kokkos::Profiling::popRegion(); \ } \ @@ -707,132 +733,103 @@ inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { "Invalid mode for oneMKL (should be one of N, T, H)"); } -template -struct spmv_onemkl_wrapper {}; - -template <> -struct spmv_onemkl_wrapper { - template - static void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, - typename matrix_type::non_const_value_type const alpha, - const matrix_type& A, const xview_type& x, - typename matrix_type::non_const_value_type const beta, - const yview_type& y) { - using scalar_type = typename matrix_type::non_const_value_type; - using ordinal_type = typename matrix_type::non_const_ordinal_type; - - // oneAPI doesn't directly support mode H with real values, but this is - // equivalent to mode T - if (mkl_mode == oneapi::mkl::transpose::conjtrans && - !Kokkos::ArithTraits::isComplex) - mkl_mode = oneapi::mkl::transpose::trans; - - oneapi::mkl::sparse::matrix_handle_t handle = nullptr; - oneapi::mkl::sparse::init_matrix_handle(&handle); - auto ev_set = oneapi::mkl::sparse::set_csr_data( - exec.sycl_queue(), handle, A.numRows(), A.numCols(), +template +inline void spmv_onemkl(const execution_space& exec, Handle* handle, + oneapi::mkl::transpose mkl_mode, + typename matrix_type::non_const_value_type const alpha, + const matrix_type& A, const xview_type& x, + typename matrix_type::non_const_value_type const beta, + const yview_type& y) { + using scalar_type = typename matrix_type::non_const_value_type; + using onemkl_scalar_type = typename KokkosToOneMKLScalar::type; + using ordinal_type = typename matrix_type::non_const_ordinal_type; + + // oneAPI doesn't directly support mode H with real values, but this is + // equivalent to mode T + if (mkl_mode == oneapi::mkl::transpose::conjtrans && + !Kokkos::ArithTraits::isComplex) + mkl_mode = oneapi::mkl::transpose::trans; + + OneMKL_SpMV_Data* subhandle; + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for OneMKL CRS"); + } else { + subhandle = new OneMKL_SpMV_Data(exec); + handle->tpl = subhandle; + oneapi::mkl::sparse::init_matrix_handle(&subhandle->mat); + // Even for out-of-order SYCL queue, the inputs here do not depend on + // kernels being sequenced + auto ev = oneapi::mkl::sparse::set_csr_data( + exec.sycl_queue(), subhandle->mat, A.numRows(), A.numCols(), oneapi::mkl::index_base::zero, const_cast(A.graph.row_map.data()), const_cast(A.graph.entries.data()), - const_cast(A.values.data())); - auto ev_gemv = - oneapi::mkl::sparse::gemv(exec.sycl_queue(), mkl_mode, alpha, handle, - x.data(), beta, y.data(), {ev_set}); - // MKL 2023.2 and up make this release okay async even though it takes a - // pointer to a stack variable -#if INTEL_MKL_VERSION >= 20230200 - oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &handle, - {ev_gemv}); -#else - auto ev_release = oneapi::mkl::sparse::release_matrix_handle( - exec.sycl_queue(), &handle, {ev_gemv}); - ev_release.wait(); -#endif - } -}; - -template <> -struct spmv_onemkl_wrapper { - template - static void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, - typename matrix_type::non_const_value_type const alpha, - const matrix_type& A, const xview_type& x, - typename matrix_type::non_const_value_type const beta, - const yview_type& y) { - using scalar_type = typename matrix_type::non_const_value_type; - using ordinal_type = typename matrix_type::non_const_ordinal_type; - using mag_type = typename Kokkos::ArithTraits::mag_type; - - oneapi::mkl::sparse::matrix_handle_t handle = nullptr; - oneapi::mkl::sparse::init_matrix_handle(&handle); - auto ev_set = oneapi::mkl::sparse::set_csr_data( - exec.sycl_queue(), handle, static_cast(A.numRows()), - static_cast(A.numCols()), oneapi::mkl::index_base::zero, - const_cast(A.graph.row_map.data()), - const_cast(A.graph.entries.data()), - reinterpret_cast*>( + reinterpret_cast( const_cast(A.values.data()))); - auto ev_gemv = oneapi::mkl::sparse::gemv( - exec.sycl_queue(), mkl_mode, alpha, handle, - reinterpret_cast*>( - const_cast(x.data())), - beta, reinterpret_cast*>(y.data()), {ev_set}); - // MKL 2023.2 and up make this release okay async even though it takes a - // pointer to a stack variable -#if INTEL_MKL_VERSION >= 20230200 - oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &handle, - {ev_gemv}); -#else - auto ev_release = oneapi::mkl::sparse::release_matrix_handle( - exec.sycl_queue(), &handle, {ev_gemv}); - ev_release.wait(); -#endif + // for out-of-order queue: the fence before gemv below will make sure + // optimize_gemv has finished + oneapi::mkl::sparse::optimize_gemv(exec.sycl_queue(), mkl_mode, + subhandle->mat, {ev}); + handle->is_set_up = true; } -}; - -#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - Kokkos::Experimental::SYCL, \ - KokkosSparse::CrsMatrix< \ - SCALAR const, ORDINAL const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, ORDINAL const>, \ - Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using execution_space = Kokkos::Experimental::SYCL; \ - using device_type = Kokkos::Device; \ - using AMatrix = \ - CrsMatrix, ORDINAL const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - static void spmv(const execution_space& exec, const Controls&, \ - const char mode[], const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ - spmv_onemkl_wrapper::is_complex>::spmv( \ - exec, mkl_mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ + + // Uncommon case: an out-of-order SYCL queue does not promise that previously + // enqueued kernels finish before starting this one. So fence exec to get the + // expected semantics. + if (!exec.sycl_queue().is_in_order()) exec.fence(); + oneapi::mkl::sparse::gemv( + exec.sycl_queue(), mkl_mode, alpha, subhandle->mat, + reinterpret_cast(x.data()), beta, + reinterpret_cast(y.data())); +} + +#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, ORDINAL const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + using device_type = Kokkos::Device; \ + using Handle = KokkosSparse::Impl::SPMVHandleImpl< \ + Kokkos::Experimental::SYCL, MEMSPACE, SCALAR, ORDINAL, ORDINAL>; \ + using AMatrix = \ + CrsMatrix, ORDINAL const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv(const execution_space& exec, Handle* handle, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ + spmv_onemkl(exec, handle, mkl_mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, @@ -841,12 +838,14 @@ KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, KOKKOSSPARSE_SPMV_ONEMKL(double, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +/* KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +*/ KOKKOSSPARSE_SPMV_ONEMKL(float, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, @@ -854,12 +853,14 @@ KOKKOSSPARSE_SPMV_ONEMKL(float, std::int64_t, KOKKOSSPARSE_SPMV_ONEMKL(double, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +/* KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +*/ #endif } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 990fcc1a30..b377806928 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -24,7 +24,6 @@ #include #include -#include "KokkosKernels_Controls.hpp" #include "KokkosKernels_default_types.hpp" // #ifndef kokkos_complex_double @@ -180,10 +179,10 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, Kokkos::fence(); } -template +template void check_spmv( - const KokkosKernels::Experimental::Controls &controls, crsMat_t input_mat, - x_vector_type x, y_vector_type y, + handle_t *handle, crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, const std::string &mode, typename Kokkos::ArithTraits::mag_type @@ -208,7 +207,7 @@ void check_spmv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(controls, mode.data(), alpha, input_mat, x, beta, y); + KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -229,9 +228,10 @@ void check_spmv( EXPECT_TRUE(num_errors == 0); } -template +template void check_spmv_mv( - crsMat_t input_mat, x_vector_type x, y_vector_type y, + Handle *handle, crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, int numMV, @@ -259,7 +259,7 @@ void check_spmv_mv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(mode.data(), alpha, input_mat, x, beta, y); + KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -388,51 +388,6 @@ void check_spmv_mv_struct( } } // check_spmv_mv_struct -template -void check_spmv_controls( - KokkosKernels::Experimental::Controls controls, crsMat_t input_mat, - x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, - typename Kokkos::ArithTraits::mag_type - max_val) { - // typedef typename crsMat_t::StaticCrsGraphType graph_t; - using ExecSpace = typename crsMat_t::execution_space; - using my_exec_space = Kokkos::RangePolicy; - using y_value_type = typename y_vector_type::non_const_value_type; - using y_value_trait = Kokkos::ArithTraits; - using y_value_mag_type = typename y_value_trait::mag_type; - - // y is the quantity being tested here, - // so let us use y_value_type to determine - // the appropriate tolerance precision. - const y_value_mag_type eps = - std::is_same::value ? 2 * 1e-3 : 1e-7; - const size_t nr = input_mat.numRows(); - y_vector_type expected_y("expected", nr); - Kokkos::deep_copy(expected_y, y); - Kokkos::fence(); - - sequential_spmv(input_mat, x, expected_y, alpha, beta); - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - controls.setParameter("algorithm", "merge"); - printf("requested merge based algorithm\n"); -#endif - - KokkosSparse::spmv(controls, "N", alpha, input_mat, x, beta, y); - int num_errors = 0; - Kokkos::parallel_reduce( - "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps, max_val), - num_errors); - if (num_errors > 0) - printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", - num_errors, y.extent_int(0), y_value_trait::abs(alpha), - y_value_trait::abs(beta)); - EXPECT_TRUE(num_errors == 0); -} // check_spmv_controls - } // namespace Test template @@ -452,15 +407,16 @@ Kokkos::complex randomUpperBound>(int mag) { template -void test_spmv(const KokkosKernels::Experimental::Controls &controls, - lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, bool heavy) { +void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, bool heavy) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; using mag_t = typename Kokkos::ArithTraits::mag_type; + using handle_t = + KokkosSparse::SPMVHandle; constexpr mag_t max_x = static_cast(1); constexpr mag_t max_y = static_cast(1); @@ -504,12 +460,17 @@ void test_spmv(const KokkosKernels::Experimental::Controls &controls, testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } + + // This handle can be reused for all following calls, since the matrix does + // not change + handle_t handle(algo); + for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(controls, input_mat, input_x, output_y, alpha, beta, + Test::check_spmv(&handle, input_mat, input_x, output_y, alpha, beta, mode, max_error); } } @@ -520,7 +481,7 @@ void test_spmv(const KokkosKernels::Experimental::Controls &controls, // hoping the transpose won't have a long column... mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(controls, input_mat, input_xt, output_yt, alpha, beta, + Test::check_spmv(&handle, input_mat, input_xt, output_yt, alpha, beta, mode, max_error); } } @@ -531,29 +492,10 @@ template void test_spmv_algorithms(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy) { - { - KokkosKernels::Experimental::Controls controls; - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); - } - - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native"); - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); - } - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "merge"); - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); - } - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native-merge"); - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); + using namespace KokkosSparse; + for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_MERGE_PATH}) { + test_spmv(algo, numRows, nnz, bandwidth, + row_size_variance, heavy); } } @@ -573,6 +515,8 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, void, size_type>; using ViewTypeX = Kokkos::View; using ViewTypeY = Kokkos::View; + using handle_t = + KokkosSparse::SPMVHandle; ViewTypeX b_x("A", numRows, numMV); ViewTypeY b_y("B", numCols, numMV); @@ -613,13 +557,14 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } + handle_t handle; for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, numMV, - mode, max_error); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, alpha, beta, + numMV, mode, max_error); } } } @@ -629,8 +574,8 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, // hoping the transpose won't have a long column... mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, alpha, beta, - numMV, mode, max_error); + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, alpha, + beta, numMV, mode, max_error); } } } @@ -654,6 +599,8 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, using ViewTypeX = Kokkos::View; using ViewTypeY = Kokkos::View; using mag_t = typename Kokkos::ArithTraits::mag_type; + using handle_t = + KokkosSparse::SPMVHandle; constexpr mag_t max_x = static_cast(10); constexpr mag_t max_y = static_cast(10); @@ -678,16 +625,18 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::deep_copy(b_y_copy, b_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "N", - max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "N", - max_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, "N", - max_y + max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "T", - max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "T", - max_y); + handle_t handle; + + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, + "N", max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, + "N", max_y); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, + "N", max_y + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, + "T", max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, + "T", max_y); // Testing all modes together, since matrix is square std::vector modes = {"N", "C", "T", "H"}; std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; @@ -696,8 +645,8 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, nv, - mode, max_error); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, alpha, + beta, nv, mode, max_error); } } } @@ -956,59 +905,6 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { output_y_copy, 1.0, 1.0, numMV, max_error); } -// check that the controls are flowing down correctly in the spmv kernel -template -void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, - const KokkosKernels::Experimental::Controls &controls = - KokkosKernels::Experimental::Controls()) { - using crsMat_t = typename KokkosSparse::CrsMatrix; - using scalar_view_t = typename crsMat_t::values_type::non_const_type; - using x_vector_type = scalar_view_t; - using y_vector_type = scalar_view_t; - using mag_t = typename Kokkos::ArithTraits::mag_type; - - constexpr mag_t max_x = static_cast(10); - constexpr mag_t max_y = static_cast(10); - constexpr mag_t max_val = static_cast(10); - - lno_t numCols = numRows; - - crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, row_size_variance, bandwidth); - lno_t nr = input_mat.numRows(); - lno_t nc = input_mat.numCols(); - - x_vector_type input_x("x", nc); - y_vector_type output_y("y", nr); - - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); - - Kokkos::fill_random(input_x, rand_pool, max_x); - Kokkos::fill_random(output_y, rand_pool, max_y); - Kokkos::fill_random(input_mat.values, rand_pool, max_val); - - const mag_t max_error = max_y + bandwidth * max_val * max_x; - - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0, - max_error); - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0, - max_error); - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0, - max_error); -} // test_spmv_controls - -// test the native algorithm -template -void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native"); - test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls); -} // test_spmv_native - // call it if ordinal int and, scalar float and double are instantiated. template void test_github_issue_101() { @@ -1177,6 +1073,10 @@ void test_spmv_all_interfaces_light() { using vector_t = Kokkos::View; using range1D_t = Kokkos::RangePolicy; using range2D_t = Kokkos::MDRangePolicy>; + using v_handle_t = + KokkosSparse::SPMVHandle; + using mv_handle_t = KokkosSparse::SPMVHandle; multivector_t x_mv("x_mv", n, 3); vector_t x("x", n); // Randomize x (it won't be modified after that) @@ -1216,41 +1116,24 @@ void test_spmv_all_interfaces_light() { space_partitions = Kokkos::Experimental::partition_space(space, 1, 1); space = space_partitions[1]; } - KokkosKernels::Experimental::Controls controls; - // All tagged versions - KokkosSparse::spmv(space, controls, "N", 1.0, A, x, 0.0, y, - KokkosSparse::RANK_ONE()); - space.fence(); - verify(); - clear_y(); - KokkosSparse::spmv(controls, "N", 1.0, A, x, 0.0, y, - KokkosSparse::RANK_ONE()); - verify(); - clear_y(); - KokkosSparse::spmv(space, controls, "N", 1.0, A, x_mv, 0.0, y_mv, - KokkosSparse::RANK_TWO()); - space.fence(); - verify_mv(); - clear_y(); - KokkosSparse::spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv, - KokkosSparse::RANK_TWO()); - verify_mv(); - clear_y(); - // Non-tagged versions - // space and controls - spmv(space, controls, "N", 1.0, A, x, 0.0, y); + + v_handle_t v_handle; + mv_handle_t mv_handle; + + // space and handle + spmv(space, &v_handle, "N", 1.0, A, x, 0.0, y); space.fence(); verify(); clear_y(); - spmv(space, controls, "N", 1.0, A, x_mv, 0.0, y_mv); + spmv(space, &mv_handle, "N", 1.0, A, x_mv, 0.0, y_mv); space.fence(); verify_mv(); clear_y(); - // controls - spmv(controls, "N", 1.0, A, x, 0.0, y); + // handle + spmv(&v_handle, "N", 1.0, A, x, 0.0, y); verify(); clear_y(); - spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv); + spmv(&mv_handle, "N", 1.0, A, x_mv, 0.0, y_mv); verify_mv(); clear_y(); // space @@ -1291,8 +1174,6 @@ void test_spmv_all_interfaces_light() { 100, 10, false); \ test_spmv_algorithms(10000, 10000 * 2, \ 100, 5, false); \ - test_spmv_controls(10000, 10000 * 20, \ - 100, 5); \ } #define EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index 5b823a22f7..6482d33d8a 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -40,7 +40,6 @@ #include #include #include -#include "KokkosKernels_Controls.hpp" #include "KokkosKernels_default_types.hpp" #include "KokkosSparse_spmv.hpp" @@ -53,29 +52,6 @@ using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; -/* Poor-man's std::optional since CUDA 11.0 seems to have an ICE - https://github.com/kokkos/kokkos-kernels/issues/1943 -*/ -struct OptCtrls { - bool present_; - KokkosKernels::Experimental::Controls ctrls_; - - OptCtrls() : present_(false) {} - OptCtrls(const KokkosKernels::Experimental::Controls &ctrls) - : present_(true), ctrls_(ctrls) {} - - operator bool() const { return present_; } - - constexpr const KokkosKernels::Experimental::Controls &operator*() - const &noexcept { - return ctrls_; - } - constexpr const KokkosKernels::Experimental::Controls *operator->() const - noexcept { - return &ctrls_; - } -}; - namespace Test_Spmv_Bsr { /*! \brief Maximum value used to fill A */ @@ -171,10 +147,10 @@ Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { /*! \brief test a specific spmv */ -template -void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, +template +void test_spmv(Handle *handle, const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { using scalar_type = typename Bsr::non_const_value_type; @@ -191,11 +167,7 @@ void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, YVector yAct("yAct", y.extent(0)); Kokkos::deep_copy(yAct, y); - if (controls) { - KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); - } else { - KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); - } + KokkosSparse::spmv(handle, mode, alpha, a, x, beta, yAct); // compare yExp and yAct auto hyExp = Kokkos::create_mirror_view(yExp); @@ -223,12 +195,8 @@ void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, } if (!errIdx.empty()) { - std::string alg; - if (controls) { - alg = controls->getParameter("algorithm", ""); - } else { - alg = ""; - } + std::string alg = + KokkosSparse::get_spmv_algorithm_name(handle->get_algorithm()); std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMV failure!" << std::endl; @@ -384,38 +352,43 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a) { template void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow) { + using namespace KokkosSparse; using scalar_type = typename Bsr::non_const_value_type; using execution_space = typename Bsr::execution_space; auto [x, y] = random_vecs_for_spmv(mode, a); - // cover a variety of controls - using Ctrls = KokkosKernels::Experimental::Controls; - std::vector ctrls = {OptCtrls(), // no controls - OptCtrls(Ctrls()), // empty controls - OptCtrls(Ctrls({{"algorithm", "tpl"}})), - OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; + using handle_t = SPMVHandle; + // cover a variety of algorithms + std::vector handles; + for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) + handles.push_back(new handle_t(algo)); + + // Tensor core algorithm temporarily disabled, fails on V100 + /* if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); + handles.push_back(new handle_t(SPMV_BSR_TC)); #if defined(KOKKOS_ARCH_AMPERE) - ctrls.push_back(OptCtrls(Ctrls( - {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); + // Also call SPMV_BSR_TC with Precision = Double on Ampere + handles.push_back(new handle_t(SPMV_BSR_TC)); + handles.back()->bsr_tc_precision = Experimental::Bsr_TC_Precision::Double; #endif // AMPERE #endif // AMPERE || VOLTA } #endif // CUDA } + */ - for (const auto &ctrl : ctrls) { + for (handle_t *handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); } } } @@ -499,9 +472,9 @@ void test_spmv() { // Note: if mode_is_transpose(mode), then maxNnzPerRow is for A^T. Otherwise, // it's for A. -template -void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, +template +void test_spm_mv(Handle *handle, const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { using scalar_type = typename Bsr::non_const_value_type; @@ -518,11 +491,7 @@ void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, YVector yAct("yAct", y.extent(0), y.extent(1)); Kokkos::deep_copy(yAct, y); - if (controls) { - KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); - } else { - KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); - } + KokkosSparse::spmv(handle, mode, alpha, a, x, beta, yAct); // compare yExp and yAct auto hyExp = Kokkos::create_mirror_view(yExp); @@ -550,12 +519,8 @@ void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, } if (!errIdx.empty()) { - std::string alg; - if (controls) { - alg = controls->getParameter("algorithm", ""); - } else { - alg = ""; - } + std::string alg = + KokkosSparse::get_spmv_algorithm_name(handle->get_algorithm()); std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMMV failure!" << std::endl; @@ -621,38 +586,44 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, template void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow) { + using namespace KokkosSparse; using execution_space = typename Bsr::execution_space; using scalar_type = typename Bsr::non_const_value_type; + using multivector_t = typename MultiVectorTypeFor::type; + using handle_t = + SPMVHandle; - // cover a variety of controls - using Ctrls = KokkosKernels::Experimental::Controls; - std::vector ctrls = {OptCtrls(), // no controls - OptCtrls(Ctrls()), // empty controls - OptCtrls(Ctrls({{"algorithm", "tpl"}})), - OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; + // cover a variety of algorithms + std::vector handles; + for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) + handles.push_back(new handle_t(algo)); + // Tensor core algorithm temporarily disabled, fails on V100 + /* if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); + handles.push_back(new handle_t(SPMV_BSR_TC)); #if defined(KOKKOS_ARCH_AMPERE) - ctrls.push_back(OptCtrls(Ctrls( - {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); + // Also call SPMV_BSR_TC with Precision = Double on Ampere + handles.push_back(new handle_t(SPMV_BSR_TC)); + handles.back()->bsr_tc_precision = Experimental::Bsr_TC_Precision::Double; #endif // AMPERE #endif // AMPERE || VOLTA } #endif // CUDA } + */ for (size_t numVecs : {1, 7}) { // num multivecs auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); - for (const auto &ctrl : ctrls) { + for (handle_t *handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spm_mv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); } } } From fe65db5e8702531217ba96ff4115705aea5c58d8 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 5 Mar 2024 12:53:02 -0700 Subject: [PATCH 177/326] Option to apply RCM reordering to extracted CRS diagonal blocks (#2125) * Add rcm option when extracting diagonal blocks * Update kk_extract_diagonal_blocks_crsmatrix_sequential * Add test for extracting diagonal blocks with rcm * Update RCM checking --- sparse/src/KokkosSparse_Utils.hpp | 131 +++++++++++++++--- .../Test_Sparse_extractCrsDiagonalBlocks.hpp | 45 +++++- 2 files changed, 151 insertions(+), 25 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index f3fbec1836..2b89c1a2f7 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -25,6 +25,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_BsrMatrix.hpp" #include "Kokkos_Bitset.hpp" +#include "KokkosGraph_RCM.hpp" #ifdef KOKKOSKERNELS_HAVE_PARALLEL_GNUSORT #include @@ -2415,15 +2416,23 @@ void kk_extract_subblock_crsmatrix_sequential( * @tparam crsMat_t The type of the CRS matrix. * @param A [in] The square CrsMatrix. It is expected that column indices are * in ascending order + * @param UseRCMReordering [in] Boolean indicating whether applying (true) RCM + * reordering to diagonal blocks or not (false) (default: false) * @param DiagBlk_v [out] The vector of the extracted the CRS diagonal blocks * (1 <= the number of diagonal blocks <= A_nrows) + * @return a vector of lists of vertices in RCM order (a list per a diagonal + * block) if UseRCMReordering is true, or an empty vector if UseRCMReordering is + * false * * Usage Example: - * kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b); + * perm = kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_out, + * UseRCMReordering); */ template -void kk_extract_diagonal_blocks_crsmatrix_sequential( - const crsMat_t &A, std::vector &DiagBlk_v) { +std::vector +kk_extract_diagonal_blocks_crsmatrix_sequential( + const crsMat_t &A, std::vector &DiagBlk_v, + bool UseRCMReordering = false) { using row_map_type = typename crsMat_t::row_map_type; using entries_type = typename crsMat_t::index_type; using values_type = typename crsMat_t::values_type; @@ -2437,6 +2446,7 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using ordinal_type = typename crsMat_t::non_const_ordinal_type; using size_type = typename crsMat_t::non_const_size_type; + using value_type = typename crsMat_t::non_const_value_type; using offset_view1d_type = Kokkos::View; @@ -2463,8 +2473,12 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( throw std::runtime_error(os.str()); } + std::vector perm_v; + std::vector perm_h_v; + if (n_blocks == 1) { // One block case: simply shallow copy A to DiagBlk_v[0] + // Note: always not applying RCM reordering, for now DiagBlk_v[0] = crsMat_t(A); } else { // n_blocks > 1 @@ -2487,12 +2501,10 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( ? (A_nrows / n_blocks) : (A_nrows / n_blocks + 1); - std::vector row_map_v(n_blocks); - std::vector entries_v(n_blocks); - std::vector values_v(n_blocks); - std::vector row_map_h_v(n_blocks); - std::vector entries_h_v(n_blocks); - std::vector values_h_v(n_blocks); + if (UseRCMReordering) { + perm_v.resize(n_blocks); + perm_h_v.resize(n_blocks); + } ordinal_type blk_row_start = 0; // first row index of i-th diagonal block ordinal_type blk_col_start = 0; // first col index of i-th diagonal block @@ -2509,37 +2521,110 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( // First round: count i-th non-zeros or size of entries_v[i] and find // the first and last column indices at each row size_type blk_nnz = 0; - offset_view1d_type first("first", blk_nrows); // first position per row - offset_view1d_type last("last", blk_nrows); // last position per row + offset_view1d_type first( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "first"), + blk_nrows); // first position per row + offset_view1d_type last( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "last"), + blk_nrows); // last position per row kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential( A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows, blk_ncols, blk_nnz, first, last); // Second round: extract - row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1); - entries_v[i] = out_entries_type("entries_v", blk_nnz); - values_v[i] = out_values_type("values_v", blk_nnz); - row_map_h_v[i] = - out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1); - entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", blk_nnz); - values_h_v[i] = out_values_hostmirror_type("values_h_v", blk_nnz); + out_row_map_type row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map"), + blk_nrows + 1); + out_entries_type entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"), + blk_nnz); + out_values_type values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), blk_nnz); + out_row_map_hostmirror_type row_map_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_h"), + blk_nnz); + out_values_hostmirror_type values_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_h"), + blk_nnz); kk_extract_subblock_crsmatrix_sequential( A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first, - last, row_map_h_v[i], entries_h_v[i], values_h_v[i]); + last, row_map_h, entries_h, values_h); + + if (!UseRCMReordering) { + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + } else { + perm_h_v[i] = KokkosGraph::Experimental::graph_rcm< + Kokkos::DefaultHostExecutionSpace>(row_map_h, entries_h); + perm_v[i] = out_entries_type( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "perm_v"), + perm_h_v[i].extent(0)); + + out_row_map_hostmirror_type row_map_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_perm_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_perm_h"), + blk_nnz); + out_values_hostmirror_type values_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_perm_h"), + blk_nnz); + + out_entries_hostmirror_type reverseperm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "reverseperm_h"), + blk_nrows); + for (ordinal_type ii = 0; ii < blk_nrows; ii++) + reverseperm_h(perm_h_v[i](ii)) = ii; + + std::map colIdx_Value_rcm; + + // Loop through each row of the reordered matrix + size_type cnt = 0; + for (ordinal_type ii = 0; ii < blk_nrows; ii++) { + colIdx_Value_rcm.clear(); + // ii: reordered index + ordinal_type origRow = reverseperm_h( + ii); // get the original row idx of the reordered row idx, ii + for (size_type j = row_map_h(origRow); j < row_map_h(origRow + 1); + j++) { + ordinal_type origEi = entries_h(j); + value_type origV = values_h(j); + ordinal_type Ei = + perm_h_v[i](origEi); // get the reordered col idx of the + // original col idx, origEi + colIdx_Value_rcm[Ei] = origV; + } + row_map_perm_h(ii) = cnt; + for (typename std::map::iterator it = + colIdx_Value_rcm.begin(); + it != colIdx_Value_rcm.end(); ++it) { + entries_perm_h(cnt) = it->first; + values_perm_h(cnt) = it->second; + cnt++; + } + } + row_map_perm_h(blk_nrows) = cnt; - Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); - Kokkos::deep_copy(entries_v[i], entries_h_v[i]); - Kokkos::deep_copy(values_v[i], values_h_v[i]); + Kokkos::deep_copy(row_map, row_map_perm_h); + Kokkos::deep_copy(entries, entries_perm_h); + Kokkos::deep_copy(values, values_perm_h); + Kokkos::deep_copy(perm_v[i], perm_h_v[i]); + } DiagBlk_v[i] = crsMat_t("CrsMatrix", blk_nrows, blk_ncols, blk_nnz, - values_v[i], row_map_v[i], entries_v[i]); + values, row_map, entries); blk_row_start += blk_nrows; } // for (ordinal_type i = 0; i < n_blocks; i++) } // A_nrows >= 1 } // n_blocks > 1 + return perm_v; } } // namespace Impl diff --git a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp index 327780dec3..28674ad353 100644 --- a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp +++ b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -15,6 +15,8 @@ //@HEADER #include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosBlas1_nrm2.hpp" #include "KokkosKernels_TestUtils.hpp" namespace Test { @@ -31,6 +33,7 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { crsMat_t A; std::vector DiagBlks(nblocks); + std::vector DiagBlks_rcm(nblocks); if (nrows != 0) { // Generate test matrix @@ -84,6 +87,10 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, DiagBlks); + auto perm = + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential( + A, DiagBlks_rcm, true); + // Checking lno_t numRows = 0; lno_t numCols = 0; @@ -125,6 +132,40 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { col_start += DiagBlks[i].numCols(); } EXPECT_TRUE(flag); + + // Checking RCM + if (!perm.empty()) { + scalar_t one = scalar_t(1.0); + scalar_t zero = scalar_t(0.0); + scalar_t mone = scalar_t(-1.0); + for (int i = 0; i < nblocks; i++) { + ValuesType In("In", DiagBlks[i].numRows()); + ValuesType Out("Out", DiagBlks[i].numRows()); + + ValuesType_hm h_Out = Kokkos::create_mirror_view(Out); + ValuesType_hm h_Out_tmp = Kokkos::create_mirror(Out); + + Kokkos::deep_copy(In, one); + + auto h_perm = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), perm[i]); + + KokkosSparse::spmv("N", one, DiagBlks_rcm[i], In, zero, Out); + + Kokkos::deep_copy(h_Out_tmp, Out); + for (lno_t ii = 0; ii < static_cast(DiagBlks[i].numRows()); + ii++) { + lno_t rcm_ii = h_perm(ii); + h_Out(ii) = h_Out_tmp(rcm_ii); + } + Kokkos::deep_copy(Out, h_Out); + + KokkosSparse::spmv("N", one, DiagBlks[i], In, mone, Out); + + double nrm_val = KokkosBlas::nrm2(Out); + EXPECT_LE(nrm_val, 1e-9); + } + } } } } // namespace Test @@ -136,9 +177,9 @@ void test_extract_diagonal_blocks() { Test::run_test_extract_diagonal_blocks( 0, s); Test::run_test_extract_diagonal_blocks( - 12, s); + 153, s); Test::run_test_extract_diagonal_blocks( - 123, s); + 1553, s); } } From 9d27c1f436d7512ae480c26042e45fd5a86c2fa7 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 29 Feb 2024 20:56:32 -0700 Subject: [PATCH 178/326] cm_test_all_sandia: various updates - updates for blake --- scripts/cm_test_all_sandia | 120 +++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 46 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index fda38735a0..9f4e7eb88f 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -93,6 +93,9 @@ print_help() { echo " Valid items:" echo " blas, mkl, cublas, cusparse, cusolver, magma, armpl, rocblas, rocsparse, rocsolver" echo "" + echo "--cmake-flags=[CMAKE Command options]: Set Kokkos Kernels cmake options not handled by script" + echo "--kokkos-cmake-flags=[CMAKE Command options]: Set Kokkos cmake options not handled by script" + echo "" echo "ARGS: list of expressions matching compilers to test" echo " supported compilers sems" @@ -152,7 +155,6 @@ fi if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name MACHINE=blake - module load git fi if [[ "$HOSTNAME" == *solo* ]]; then # Warning: very generic name @@ -210,7 +212,6 @@ fi echo "Running on machine: $MACHINE" GCC_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" INTEL_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" CLANG_BUILD_LIST="Threads,Serial,Threads_Serial" @@ -218,7 +219,6 @@ CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Threads,Cuda_Serial" CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized,-diag-disable=1011,-diag-disable=869" CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" @@ -418,6 +418,12 @@ do --with-tpls*) KOKKOSKERNELS_ENABLE_TPLS="${key#*=}" ;; + --cmake-flags*) + PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; + --kokkos-cmake-flags*) + KOKKOS_PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; --help*) PRINT_HELP=True ;; @@ -732,53 +738,56 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then ARCH_FLAG="--arch=VEGA90A" fi elif [ "$MACHINE" = "blake" ]; then + MODULE_ENVIRONMENT="source /projects/x86-64-icelake-rocky8/spack-config/blake-setup-user-module-env.sh" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - module load cmake/3.19.3 - BASE_MODULE_LIST="cmake/3.19.3,/" - BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" - BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/,/oneAPI/hpc-toolkit/" - ONEAPI_WARNING_FLAGS="" + module load cmake - GCC102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21/gcc/10.2.0" + BASE_MODULE_LIST="cmake,/" + BASE_MODULE_LIST_TPLS="cmake,/,openblas/0.3.23" + BASE_MODULE_LIST_ONEAPI_202310="cmake,-oneapi-compilers/,intel-oneapi-dpl/2022.1.0,intel-oneapi-mkl/2023.1.0,intel-oneapi-tbb/2021.9.0" + BASE_MODULE_LIST_ONEAPI_202320="cmake,-oneapi-compilers/,intel-oneapi-dpl/2022.2.0,intel-oneapi-mkl/2023.2.0,intel-oneapi-tbb/2021.10.0" + ONEAPI_FLAGS_EXTRA="-fp-model=precise" + LLVM_EXTRA_FLAGS="-fPIC ${CLANG_WARNING_FLAGS}" + # Remove -Wuninitialized: compiler issues show up with Threads backend + GCC11_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered" + # update KOKKOS_PASSTHRU_CMAKE_FLAGS to disable onedpl on Blake + KOKKOS_PASSTHRU_CMAKE_FLAGS="${KOKKOS_PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_ONEDPL=OFF" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - # TODO: Failing toolchains: - #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" - "intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" + COMPILERS=("intel/2023.1.0 $BASE_MODULE_LIST_ONEAPI_202310 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "intel/2023.2.0 $BASE_MODULE_LIST_ONEAPI_202320 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "llvm/15.0.7 $BASE_MODULE_LIST "Threads,Serial" clang++ $LLVM_EXTRA_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" + "gcc/12.2.0 $BASE_MODULE_LIST "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" - "gcc/10.2.0 $GCC102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + # Known issues: + # gcc/12.2.0+openblas/0.3.23 with OpenMP: internal compiler error: in get_vectype_for_scalar_type, at tree-vect-stmts + COMPILERS=("intel/2023.1.0 $BASE_MODULE_LIST_ONEAPI_202310 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "intel/2023.2.0 $BASE_MODULE_LIST_ONEAPI_202320 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "llvm/15.0.7 $BASE_MODULE_LIST_TPLS "Threads,Serial" clang++ $LLVM_EXTRA_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST_TPLS "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" + "gcc/12.2.0 $BASE_MODULE_LIST_TPLS "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" ) else - COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/2021.2.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "intel/2021.4.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "intel/2022.1.2 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + # gcc/12.2.0 with OpenMP: internal compiler error: in get_vectype_for_scalar_type, at tree-vect-stmts + COMPILERS=("intel/2023.1.0 $BASE_MODULE_LIST_ONEAPI_202310 $INTEL_BUILD_LIST icpx $ONEAPI_FLAGS_EXTRA" + "intel/2023.2.0 $BASE_MODULE_LIST_ONEAPI_202320 $INTEL_BUILD_LIST icpx $ONEAPI_FLAGS_EXTRA" + "llvm/15.0.7 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $LLVM_EXTRA_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC11_WARNING_FLAGS" + "gcc/12.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC11_WARNING_FLAGS" ) fi if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SKX" + ARCH_FLAG="--arch=SPR" fi - SPACK_HOST_ARCH="+skx" + SPACK_HOST_ARCH="+spr" elif [ "$MACHINE" = "solo" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 @@ -793,8 +802,7 @@ elif [ "$MACHINE" = "solo" ]; then GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" if [ "$SPOT_CHECK" = "True" ]; then - COMPILERS=( - "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" "llvm/10.0.1 $BASE_MODULE_LIST_LLVM "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then @@ -803,8 +811,7 @@ elif [ "$MACHINE" = "solo" ]; then ) else ###"clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - COMPILERS=( - "gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" ) fi @@ -885,6 +892,7 @@ fi export OMP_NUM_THREADS=${omp_num_threads:=8} export OMP_PROC_BIND=${omp_proc_bind:=spread} export OMP_PLACES=${omp_places:=cores} +export KOKKOS_NUM_THREADS=8 declare -i NUM_RESULTS_TO_KEEP=7 @@ -948,6 +956,7 @@ if [ "$COMPILERS_TO_TEST" == "" ]; then exit 1 fi + # # Functions. # @@ -1121,10 +1130,9 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # Some machines will require explicitly setting include dirs and libs - if ([[ "$MACHINE" = weaver* ]] || [[ "$MACHINE" = blake* ]] || [[ "$MACHINE" = sogpu* ]]) && [[ "$mod" = openblas* ]]; then + if ([[ "$MACHINE" = weaver* ]] || [[ "$MACHINE" = sogpu* ]]) && [[ "$mod" = openblas* ]]; then BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" - # BLAS_LIBRARIES="openblas" BLAS_LIBRARIES="blas" LAPACK_LIBRARIES="lapack" KOKKOSKERNELS_TPL_PATH_CMD="--user-blas-path=${BLAS_LIBRARY_DIRS} --user-lapack-path=${LAPACK_LIBRARY_DIRS}" @@ -1132,6 +1140,16 @@ setup_env() { KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD="--extra-linker-flags=-lgfortran,-lm" echo "TPL PATHS: KOKKOSKERNELS_TPL_PATH_CMD=$KOKKOSKERNELS_TPL_PATH_CMD" echo "TPL LIBS: KOKKOSKERNELS_TPL_LIBS_CMD=$KOKKOSKERNELS_TPL_LIBS_CMD" + elif [[ "$MACHINE" = blake* ]] && [[ "$mod" = openblas* ]]; then + BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" + LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" + BLAS_LIBRARIES="openblas" + LAPACK_LIBRARIES="openblas" + KOKKOSKERNELS_TPL_PATH_CMD="--user-blas-path=${BLAS_LIBRARY_DIRS} --user-lapack-path=${LAPACK_LIBRARY_DIRS}" + KOKKOSKERNELS_TPL_LIBS_CMD="--user-blas-lib=${BLAS_LIBRARIES} --user-lapack-lib=${LAPACK_LIBRARIES}" + KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD="--extra-linker-flags=-lgfortran,-lm" + echo "TPL PATHS: KOKKOSKERNELS_TPL_PATH_CMD=$KOKKOSKERNELS_TPL_PATH_CMD" + echo "TPL LIBS: KOKKOSKERNELS_TPL_LIBS_CMD=$KOKKOSKERNELS_TPL_LIBS_CMD" elif ([[ "$MACHINE" = weaver* ]]) && [[ "$mod" = netlib* ]]; then BLAS_LIBRARY_DIRS="${BLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${BLAS_ROOT}/lib" @@ -1162,8 +1180,9 @@ single_build_and_test() { # Set up env. local compiler_modules_list=$(get_compiler_modules $compiler) - mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" + local BUILD_AND_TEST_DIR=$ROOT_DIR/$compiler/"${build}-$build_type" + mkdir -p $BUILD_AND_TEST_DIR + cd $BUILD_AND_TEST_DIR local kokkos_variants=$(get_kokkos_variants $compiler) local kernels_variants=$(get_kernels_variants $compiler) @@ -1206,6 +1225,7 @@ single_build_and_test() { echo " export OMP_NUM_THREADS=$omp_num_threads" &>> reload_modules.sh echo " export OMP_PROC_BIND=$omp_proc_bind" &>> reload_modules.sh echo " export OMP_PLACES=$omp_places" &>> reload_modules.sh + echo " export KOKKOS_NUM_THREADS=8" &>> reload_modules.sh echo "" &>> reload_modules.sh chmod +x reload_modules.sh @@ -1285,6 +1305,7 @@ single_build_and_test() { HIP_ENABLE_CMD="--with-hip" fi local arch_code=$(echo $ARCH_FLAG | cut -d "=" -f 2) + local tpl_list_print=$(echo $KOKKOSKERNELS_ENABLE_TPL_CMD | cut -d "=" -f2-) echo "kokkos devices: ${LOCAL_KOKKOS_DEVICES}" echo "kokkos arch: ${arch_code}" echo "kokkos options: ${KOKKOS_OPTIONS}" @@ -1295,16 +1316,17 @@ single_build_and_test() { echo "kokkoskernels ordinals: ${KOKKOSKERNELS_ORDINALS}" echo "kokkoskernels offsets: ${KOKKOSKERNELS_OFFSETS}" echo "kokkoskernels layouts: ${KOKKOSKERNELS_LAYOUTS}" + echo "kokkoskernels tpls list: ${tpl_list_print}" # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions echo " # Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} --cmake-flags=${PASSTHRU_CMAKE_FLAGS} --kokkos-cmake-flags=${KOKKOS_PASSTHRU_CMAKE_FLAGS} $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} --cmake-flags=${PASSTHRU_CMAKE_FLAGS} --kokkos-cmake-flags=${KOKKOS_PASSTHRU_CMAKE_FLAGS} $extra_args" &> call_generate_makefile_genericpath.sh - run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} --cmake-flags=${PASSTHRU_CMAKE_FLAGS} --kokkos-cmake-flags=${KOKKOS_PASSTHRU_CMAKE_FLAGS} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local -i build_start_time=$(date +%s) run_cmd make -j $MAKE_PAR_LEVEL all >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } @@ -1422,9 +1444,15 @@ wait_summarize_and_exit() { rv=$rv+1 local str=$failed_test - local comp=$(echo "$str" | cut -d- -f1) - local vers=$(echo "$str" | cut -d- -f2) - local lbuild=$(echo "$str" | cut -d- -f3-) + # Note: all relevant info in str to assemble the build directory path + # is separated by dashes; however the compiler name may include dashes as well + # the final two pieces of str always the version and build-type (as set in BUILD_AND_TEST_DIR) + # leaving the compiler name as the remaining fields preceding version + local getdashes="${str//[^-]}" + local numdashes=${#getdashes} + local lbuild=$(echo "$str" | cut -d- -f${numdashes}-) + local vers=$(echo "$str" | cut -d- -f$((numdashes-1))) + local comp=$(echo "$str" | cut -d- -f-$((numdashes-2))) # Generate reproducer instructions #local filename=reproducer_instructions-$comp-$vers-$lbuild local faildir=$ROOT_DIR/$comp/$vers/$lbuild From 5524ed656391a82bfbb6507aa5b71269f23a501e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 5 Mar 2024 19:48:16 -0700 Subject: [PATCH 179/326] cm_test_all_sandia: drop decommissioned/unavailable machines - remove voltrino, mayer --- scripts/cm_test_all_sandia | 45 -------------------------------------- 1 file changed, 45 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 9f4e7eb88f..95ce5c1f62 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -148,11 +148,6 @@ if [[ "$HOSTNAME" =~ weaver.* ]]; then module load git fi -if [[ "$HOSTNAME" =~ .*voltrino.* ]]; then - MACHINE=voltrino - module load git -fi - if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name MACHINE=blake fi @@ -165,15 +160,6 @@ if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then MACHINE=kokkos-dev-2 fi -if [[ "$HOSTNAME" == may* ]]; then - MACHINE=mayer -# module load git -fi - -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - if [[ "$HOSTNAME" == caraway* ]]; then # Warning: very generic name MACHINE=caraway fi @@ -642,37 +628,6 @@ elif [ "$MACHINE" = "weaver" ]; then SPACK_HOST_ARCH="+power9" SPACK_CUDA_ARCH="+volta70" -elif [ "$MACHINE" = "voltrino" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,/,gcc/9.3.0" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="cmake/3.17.1,/" - - ARMCLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMV8_THUNDERX2" - fi - - SPACK_HOST_ARCH="+armv8_tx2" elif [ "$MACHINE" = "caraway" ]; then SKIP_HWLOC=True # BUILD_ONLY=True From 865d84cf8c4189968b6be495e700928a94a579cf Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 6 Mar 2024 15:09:14 -0500 Subject: [PATCH 180/326] Fix2130 (#2132) * Fix #2130 - Do not call BsrMatrix spmv impl if block size is 1 - Instead, convert it to unmanaged CrsMatrix and call spmv again - cuSPARSE returned an error code in this case - Better performance * Formatting * Remove redundant remove_pointer_t Handle is already a non-pointer type --- sparse/src/KokkosSparse_spmv.hpp | 44 ++++++++++++++++++++----- sparse/src/KokkosSparse_spmv_handle.hpp | 29 +++++++++++++++- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index bcff9e29e9..0969c2f1e2 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -110,15 +110,20 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], "KokkosSparse::spmv: Output Vector must be non-const."); // Check that A, X, Y types match that of the Handle - static_assert( - std::is_same_v, - "KokkosSparse::spmv: AMatrix must be identical to Handle::AMatrixType"); - static_assert( - std::is_same_v, - "KokkosSparse::spmv: XVector must be identical to Handle::XVectorType"); - static_assert( - std::is_same_v, - "KokkosSparse::spmv: YVector must be identical to Handle::YVectorType"); + // But only check this if Handle is the user-facing type (SPMVHandle). + // We may internally call spmv with SPMVHandleImpl, which does not include + // the matrix and vector types. + if constexpr (KokkosSparse::Impl::is_spmv_handle_v) { + static_assert( + std::is_same_v, + "KokkosSparse::spmv: AMatrix must be identical to Handle::AMatrixType"); + static_assert( + std::is_same_v, + "KokkosSparse::spmv: XVector must be identical to Handle::XVectorType"); + static_assert( + std::is_same_v, + "KokkosSparse::spmv: YVector must be identical to Handle::YVectorType"); + } constexpr bool isBSR = Experimental::is_bsr_matrix_v; @@ -167,6 +172,7 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], return; } + // Get the "impl" parent class of Handle, if it's not already the impl using HandleImpl = typename Handle::ImplType; using ACrs_Internal = CrsMatrix< @@ -181,6 +187,26 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], using AMatrix_Internal = std::conditional_t; + // Intercept special case: A is a BsrMatrix with blockDim() == 1 + // This is exactly equivalent to CrsMatrix (more performant) + // and cuSPARSE actually errors out in that case. + // + // This relies on the fact that this codepath will always be taken for + // this particular matrix (so internally, this handle is only ever used for + // Crs) + if constexpr (isBSR) { + if (A.blockDim() == 1) { + // Construct an ACrs_Internal (unmanaged memory) from A's views + typename ACrs_Internal::row_map_type rowmap(A.graph.row_map); + typename ACrs_Internal::index_type entries(A.graph.entries); + typename ACrs_Internal::values_type values(A.values); + ACrs_Internal ACrs(std::string{}, A.numRows(), A.numCols(), A.nnz(), + values, rowmap, entries); + spmv(space, handle->get_impl(), mode, alpha, ACrs, x, beta, y); + return; + } + } + AMatrix_Internal A_i(A); // Note: data_type of a View includes both the scalar and rank diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index 31e21481af..2ce32a1f98 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -251,6 +251,9 @@ template struct SPMVHandleImpl { using ExecutionSpaceType = ExecutionSpace; + // This is its own ImplType + using ImplType = + SPMVHandleImpl; // Do not allow const qualifier on Scalar, Ordinal, Offset (otherwise this // type won't match the ETI'd type). Users should not use SPMVHandleImpl // directly and SPMVHandle explicitly removes const, so this should never @@ -268,6 +271,10 @@ struct SPMVHandleImpl { void set_exec_space(const ExecutionSpace& exec) { if (tpl) tpl->set_exec_space(exec); } + + /// Get the SPMVAlgorithm used by this handle + SPMVAlgorithm get_algorithm() const { return this->algo; } + bool is_set_up = false; const SPMVAlgorithm algo = SPMV_DEFAULT; TPL_SpMV_Data* tpl = nullptr; @@ -385,9 +392,29 @@ struct SPMVHandle } /// Get the SPMVAlgorithm used by this handle - SPMVAlgorithm get_algorithm() const { return this->algo; } + SPMVAlgorithm get_algorithm() const { + // Note: get_algorithm is also a method of parent ImplType, but for + // documentation purposes it should appear directly in the public interface + // of SPMVHandle + return this->algo; + } + + /// Get pointer to this as the impl type + ImplType* get_impl() { return static_cast(this); } }; +namespace Impl { +template +struct is_spmv_handle : public std::false_type {}; +template +struct is_spmv_handle> : public std::true_type {}; +template +struct is_spmv_handle> : public std::true_type {}; + +template +inline constexpr bool is_spmv_handle_v = is_spmv_handle::value; +} // namespace Impl + } // namespace KokkosSparse #endif From 74f0ed7898eccae4642fa3d69c3c0d20294c13de Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 7 Mar 2024 08:04:21 -0700 Subject: [PATCH 181/326] Benchmark: modifying spmv benchmark to run range of spmv tests (#2135) This could be further automated to run on matrix from suite sparse --- .../sparse/KokkosSparse_spmv_benchmark.cpp | 88 +++++++++++++------ 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 4ae0a34168..6adf55b26e 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -35,13 +35,20 @@ namespace { struct spmv_parameters { - int N, offset; + int N, offset, numvecs; + std::string mode; std::string filename; std::string alg; std::string tpl; spmv_parameters(const int N_) - : N(N_), offset(0), filename(""), alg(""), tpl("") {} + : N(N_), + offset(0), + numvecs(1), + mode(""), + filename(""), + alg(""), + tpl("") {} }; void print_options() { @@ -49,9 +56,11 @@ void print_options() { std::cerr << perf_test::list_common_options(); - std::cerr - << "\t[Optional] --repeat :: how many times to repeat overall test" - << std::endl; + std::cerr << "\t[Optional] --mode :: whether to run a suite of " + << "automated test or manually define one (auto, manual)" + << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + << "test" << std::endl; std::cerr << " -n [N] :: generate a semi-random banded (band size " "0.01xN)\n" "NxN matrix with average of 10 entries per row." @@ -59,25 +68,30 @@ void print_options() { std::cerr << "\t[Optional] --alg :: the algorithm to run (default, " "native, merge)" << std::endl; - std::cerr - << "\t[Optional] --alg :: the algorithm to run (classic, merge)" - << std::endl; std::cerr << "\t[Optional] --TPL :: when available and compatible with " "alg, a TPL can be used (cusparse, rocsparse, MKL)" << std::endl; - std::cerr - << " -f [file] : Read in Matrix Market formatted text file 'file'." - << std::endl; + std::cerr << " -f [file] : Read in Matrix Market formatted text file" + << " 'file'." << std::endl; std::cerr << " --offset [O] : Subtract O from every index.\n" << " Useful in case the matrix market file is " "not 0 based." << std::endl; + std::cerr << " --num_vecs : The number of vectors stored in X and Y" + << std::endl; } // print_options void parse_inputs(int argc, char** argv, spmv_parameters& params) { for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--mode", params.alg)) { + if ((params.mode != "") && (params.mode != "auto") && + (params.alg != "manual")) { + throw std::runtime_error( + "--mode can only be an empty string, `auto` or `manual`!"); + } + ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { if ((params.alg != "") && (params.alg != "default") && (params.alg != "native") && (params.alg != "merge")) { @@ -93,6 +107,9 @@ void parse_inputs(int argc, char** argv, spmv_parameters& params) { } else if (perf_test::check_arg_int(i, argc, argv, "--offset", params.offset)) { ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--num_vecs", + params.numvecs)) { + ++i; } else { print_options(); KK_USER_REQUIRE_MSG(false, "Unrecognized command line argument #" @@ -105,13 +122,21 @@ template void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { using matrix_type = KokkosSparse::CrsMatrix; - using mv_type = Kokkos::View; - - KokkosKernels::Experimental::Controls controls; - if ((inputs.alg == "default") || (inputs.alg == "native") || - (inputs.alg == "merge")) { - controls.setParameter("algorithm", inputs.alg); + using mv_type = Kokkos::View; + using handle_t = + KokkosSparse::SPMVHandle; + + KokkosSparse::SPMVAlgorithm spmv_alg; + if ((inputs.alg == "default") || (inputs.alg == "")) { + spmv_alg = KokkosSparse::SPMVAlgorithm::SPMV_DEFAULT; + } else if (inputs.alg == "native") { + spmv_alg = KokkosSparse::SPMVAlgorithm::SPMV_NATIVE; + } else if (inputs.alg == "merge") { + spmv_alg = KokkosSparse::SPMVAlgorithm::SPMV_MERGE_PATH; + } else { + throw std::runtime_error("invalid spmv algorithm"); } + handle_t handle(spmv_alg); // Create test matrix srand(17312837); @@ -126,8 +151,8 @@ void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { } // Create input vectors - mv_type x("X", A.numRows()); - mv_type y("Y", A.numCols()); + mv_type x("X", A.numRows(), inputs.numvecs); + mv_type y("Y", A.numCols(), inputs.numvecs); Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x, rand_pool, 10); @@ -136,7 +161,7 @@ void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { // Run the actual experiments for (auto _ : state) { - KokkosSparse::spmv(controls, KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); + KokkosSparse::spmv(&handle, KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); Kokkos::fence(); } } @@ -159,12 +184,25 @@ int main(int argc, char** argv) { spmv_parameters inputs(100000); parse_inputs(argc, argv, inputs); - // Google benchmark will report the wrong n if an input file matrix is used. - KokkosKernelsBenchmark::register_benchmark_real_time( - bench_name.c_str(), run_spmv, {"n"}, - {inputs.N}, common_params.repeat, inputs); - benchmark::RunSpecifiedBenchmarks(); + if ((inputs.mode == "") || (inputs.mode == "auto")) { + for (int n : {10000, 20000, 40000, 100000, 250000, 1000000}) { + for (int nv : {1, 2, 3, 4, 10}) { + inputs.N = n; + inputs.numvecs = nv; + KokkosKernelsBenchmark::register_benchmark_real_time( + bench_name.c_str(), run_spmv, + {"n", "nv"}, {inputs.N, inputs.numvecs}, common_params.repeat, + inputs); + } + } + } else { + // Google benchmark will report the wrong n if an input file matrix is used. + KokkosKernelsBenchmark::register_benchmark_real_time( + bench_name.c_str(), run_spmv, {"n"}, + {inputs.N}, common_params.repeat, inputs); + } + benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); Kokkos::finalize(); From 8f2945d0c99791345053fc839b1ea453354e03f9 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 7 Mar 2024 08:04:37 -0700 Subject: [PATCH 182/326] Kokkos Kernels: update version guards to drop old version of Kokkos (#2133) Since we are now in the 4.2 series we only support up to 4.1.00. Older version of Kokkos Core will require older version of Kokkos Kernels for compatibility. Once 4.3.00 is out we will move to drop support for the 4.1 series and only keep 4.2 and 4.3 series. --- batched/KokkosBatched_Util.hpp | 23 ------------------- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 2 -- .../KokkosBatched_SVD_Serial_Internal.hpp | 4 ---- common/impl/KokkosKernels_ViewUtils.hpp | 6 ----- sparse/impl/KokkosSparse_coo2crs_impl.hpp | 7 ------ .../KokkosSparse_spmv_bsrmatrix_impl_v42.hpp | 15 ------------ sparse/src/KokkosSparse_coo2crs.hpp | 6 ----- sparse/unit_test/Test_Sparse.hpp | 2 -- 8 files changed, 65 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 71c40482d6..fc14bd5a19 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -626,7 +626,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, const Trans::NoTranspose) { return subview_wrapper(v, i1, i2, i3, layout_tag); } -#if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, @@ -636,17 +635,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, return transpose_2d_view(sv_nt, layout_tag); } -#else -template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::ALL_t i2, Kokkos::ALL_t i3, - const BatchLayout::Left &layout_tag, - const Trans::Transpose) { - auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); - - return transpose_2d_view(sv_nt, layout_tag); -} -#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, @@ -670,7 +658,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { return subview_wrapper(v, i1, i2, i3, layout_tag); } -#if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, @@ -679,16 +666,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( return transpose_2d_view(sv_nt, layout_tag); } -#else -template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, - const BatchLayout::Right &layout_tag, const Trans::Transpose &) { - auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); - - return transpose_2d_view(sv_nt, layout_tag); -} -#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index f70fa6b963..464ea6d04a 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -93,11 +93,9 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, case BaseKokkosBatchedAlgos::KK_SERIAL: case BaseHeuristicAlgos::SQUARE: case BaseTplAlgos::ARMPL: -#if KOKKOS_VERSION > 40099 assert(A.rank_dynamic() == 3 && "AViewType must have rank 3."); assert(B.rank_dynamic() == 3 && "BViewType must have rank 3."); assert(C.rank_dynamic() == 3 && "CViewType must have rank 3."); -#endif break; default: std::ostringstream os; diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index c9fd0417f6..34c92c2d24 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -55,11 +55,7 @@ struct SerialSVDInternal { value_type a = Kokkos::ArithTraits::one(); value_type b = -a11 - a22; value_type c = a11 * a22 - a21 * a21; -#if KOKKOS_VERSION >= 30699 using Kokkos::sqrt; -#else - using Kokkos::Experimental::sqrt; -#endif value_type sqrtDet = sqrt(b * b - 4 * a * c); e1 = (-b + sqrtDet) / (2 * a); e2 = (-b - sqrtDet) / (2 * a); diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp index ac4abb6457..2ae8fb609d 100644 --- a/common/impl/KokkosKernels_ViewUtils.hpp +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -19,11 +19,6 @@ #include "Kokkos_Core.hpp" namespace KokkosKernels::Impl { -// lbv - 07/26/2023: -// MemoryTraits::impl_value was added -// in Kokkos 4.1.00 so we should guard -// the content of this header until v4.3.0 -#if KOKKOS_VERSION >= 40100 || defined(DOXY) /*! \brief Yields a type that is View with Kokkos::Unmanaged added to the memory * traits @@ -59,7 +54,6 @@ auto make_unmanaged(const View &v) { return typename with_unmanaged::type(v); } -#endif // KOKKOS_VERSION >= 40100 } // namespace KokkosKernels::Impl #endif diff --git a/sparse/impl/KokkosSparse_coo2crs_impl.hpp b/sparse/impl/KokkosSparse_coo2crs_impl.hpp index f11032903d..aaa5cdcb72 100644 --- a/sparse/impl/KokkosSparse_coo2crs_impl.hpp +++ b/sparse/impl/KokkosSparse_coo2crs_impl.hpp @@ -15,11 +15,6 @@ //@HEADER #ifndef KOKKOSSPARSE_COO2CRS_IMPL_HPP #define KOKKOSSPARSE_COO2CRS_IMPL_HPP -// The unorderedmap changes necessary for this to work -// have not made it into Kokkos 4.0.00 pr 4.0.01 will -// need to see if it happens in 4.1.00 to have a final -// version check here. -#if KOKKOS_VERSION >= 40099 #include #include "Kokkos_UnorderedMap.hpp" @@ -280,6 +275,4 @@ class Coo2Crs { } // namespace Impl } // namespace KokkosSparse -#endif // KOKKOS_VERSION >= 40099 - #endif // KOKKOSSPARSE_COO2CRS_IMPL_HPP diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp index 1c0d2fc361..a0f4ed1540 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp @@ -121,11 +121,6 @@ void apply_v42(const typename AMatrix::execution_space &exec, Kokkos::RangePolicy policy(exec, 0, y.size()); if constexpr (YVector::rank == 1) { -// lbv - 07/26/2023: -// with_unmanaged_t<...> required Kokkos 4.1.0, -// the content of this header will be guarded -// until v4.3.0 -#if KOKKOS_VERSION >= 40100 || defined(DOXY) // Implementation expects a 2D view, so create an unmanaged 2D view // with extent 1 in the second dimension using Y2D = KokkosKernels::Impl::with_unmanaged_t>; -#else - // Implementation expects a 2D view, so create an unmanaged 2D view - // with extent 1 in the second dimension - using Y2D = Kokkos::View< - typename YVector::value_type * [1], typename YVector::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits>; - using X2D = Kokkos::View< - typename XVector::value_type * [1], typename XVector::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits>; -#endif // KOKKOS_VERSION >= 40100 || defined(DOXY) const Y2D yu(y.data(), y.extent(0), 1); const X2D xu(x.data(), x.extent(0), 1); BsrSpmvV42NonTrans op(alpha, a, xu, beta, yu); diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index 45e54ce474..a29d818cb1 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -16,11 +16,6 @@ #ifndef _KOKKOSSPARSE_COO2CRS_HPP #define _KOKKOSSPARSE_COO2CRS_HPP -// The unorderedmap changes necessary for this to work -// have not made it into Kokkos 4.0.00 pr 4.0.01 will -// need to see if it happens in 4.1.00 to have a final -// version check here. -#if KOKKOS_VERSION >= 40099 || defined(DOXY) #include "KokkosSparse_CooMatrix.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -99,5 +94,4 @@ auto coo2crs(KokkosSparse::CooMatrix= 40099 || defined(DOXY) #endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index 8ae06b598a..624cd86ff5 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -16,9 +16,7 @@ #ifndef TEST_SPARSE_HPP #define TEST_SPARSE_HPP -#if KOKKOS_VERSION >= 40099 #include "Test_Sparse_coo2crs.hpp" -#endif // KOKKOS_VERSION >= 40099 #include "Test_Sparse_crs2coo.hpp" #include "Test_Sparse_Controls.hpp" #include "Test_Sparse_CrsMatrix.hpp" From 519ef7b2919745bb199afafc3715c864168da4ab Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 12 Mar 2024 14:12:40 -0600 Subject: [PATCH 183/326] ODE: BDF methods (#1930) * ODE: adding BDF algorithms Implementing BDF formula for stiff ODEs. Orders 1 to 5 are available and tested. The integrators can be called on GPU to solve multiple systems in parallel. * ODE: fixing storage handling for start-up RK stack * ODE: clang-format * ODE: first adaptive version of BDF The current implementation only allows for adaptivity in time, at this point the BDF Step actually converges as expected with first order integration! * ODE: fixing issues with adaptive BDF The unit-test BDF_adaptive now shows the integration of the logistic equation using adaptive time steps and increasing integration order from 1 to 5. * ODE: running BDF on StiffChemistry problem The problem runs fine and is solved but there are oscillations while the behavior of the solution is smooth. More investigation is needed... * BDF: fixing types and template parameters in batched calls Bascially we need template parameters to be more versatile and cannot assume that all rank1 views will have the exact same underlying type, for instance layouts can be different. * More fixes for GPUs only in tests this time. * ODE: BDF adaptive, fix small bug After adding rhs and update vectors to temp the subviews taken for other variables need to be offset appropriately... * Revert "More fixes for GPUs only in tests this time." This reverts commit 2f70432761485bc6a4c65a1833e7299dd2c340e2. * Revert "Revert "More fixes for GPUs only in tests this time."" This reverts commit 836012bb529551727b3f5913057acad94dfe60df. * ODE: BDF small change to temporarily avoid compile time issue True fix involving a KOKKOS_VERSION check is upcoming after more tests on GPU side... * ODE: BDF fix for some printf statements that will go away soon... * ODE: adding benchmark for BDF The benchmark helps us monitor the performance of the BDF implementaiton across multiple platforms as well as impact of changes over time. * ODE: improve benchmark interface... * ODE: BDF changes to use RMS norm and change some default values Small changes to compare more closely with reference implementation. Some of these might be reverted eventually but that's fine for now. * ODE: BDF convergence more stable and results look pretty good now! Changing the Newton solver convergence criteria as well as changing a few default input parameters leads to a more stable algorithms which can now integrate the stiff Henderson autocatalytic example well in 66 time steps instead of 200k for fixed order integration... * ODE: BDF fix bug in initial time step calculation The initial step routine was overwriting the initial right hand side which led to obvious issues further down the road... now things should work fine. Need to figure out if I can re-initialize the variables in the perf test while excluding that time from each iteration. * ODE: BDF removing bad print statement... std::cout in device code * ODE - BDF: improving perf test Basically adding new untimed setup within the main loop of the benchmark to reset the intial conditions, buffers and vectors ahead of each iteration. * Modifying unit-test to catch proper return type * Applying clang-format --- .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 36 +- batched/dense/src/KokkosBatched_Gesv.hpp | 13 +- ode/impl/KokkosODE_BDF_impl.hpp | 532 +++++++++++ ode/impl/KokkosODE_Newton_impl.hpp | 55 +- ode/src/KokkosODE_BDF.hpp | 227 +++++ ode/src/KokkosODE_Newton.hpp | 10 +- ode/src/KokkosODE_Types.hpp | 13 +- ode/unit_test/Test_ODE.hpp | 1 + ode/unit_test/Test_ODE_BDF.hpp | 830 ++++++++++++++++++ ode/unit_test/Test_ODE_Newton.hpp | 31 +- perf_test/ode/CMakeLists.txt | 4 + perf_test/ode/KokkosODE_BDF.cpp | 266 ++++++ 12 files changed, 1971 insertions(+), 47 deletions(-) create mode 100644 ode/impl/KokkosODE_BDF_impl.hpp create mode 100644 ode/src/KokkosODE_BDF.hpp create mode 100644 ode/unit_test/Test_ODE_BDF.hpp create mode 100644 perf_test/ode/KokkosODE_BDF.cpp diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index e4e0d5b8b7..86d0d0873e 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -366,20 +366,24 @@ KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, /// =========== template <> struct SerialGesv { - template + template KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const VectorType X, - const VectorType Y, + const XVectorType X, + const YVectorType Y, const MatrixType tmp) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(XVectorType::rank == 1, + "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, + "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -462,20 +466,24 @@ struct SerialGesv { template <> struct SerialGesv { - template + template KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const VectorType X, - const VectorType Y, + const XVectorType X, + const YVectorType Y, const MatrixType /*tmp*/) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(XVectorType::rank == 1, + "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, + "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions at run time. diff --git a/batched/dense/src/KokkosBatched_Gesv.hpp b/batched/dense/src/KokkosBatched_Gesv.hpp index 3abedfd0aa..c4821db459 100644 --- a/batched/dense/src/KokkosBatched_Gesv.hpp +++ b/batched/dense/src/KokkosBatched_Gesv.hpp @@ -63,11 +63,18 @@ struct Gesv { template struct SerialGesv { - template + template KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const VectorType X, - const VectorType Y, + const XVectorType X, + const YVectorType Y, const MatrixType tmp); + + template + [[deprecated]] KOKKOS_INLINE_FUNCTION static int invoke( + const MatrixType A, const VectorType X, const VectorType Y, + const MatrixType tmp) { + return invoke(A, X, Y, tmp); + } }; /// \brief Team Batched GESV: diff --git a/ode/impl/KokkosODE_BDF_impl.hpp b/ode/impl/KokkosODE_BDF_impl.hpp new file mode 100644 index 0000000000..cf89731f1b --- /dev/null +++ b/ode/impl/KokkosODE_BDF_impl.hpp @@ -0,0 +1,532 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS_BDF_IMPL_HPP +#define KOKKOSBLAS_BDF_IMPL_HPP + +#include "Kokkos_Core.hpp" + +#include "KokkosODE_Newton.hpp" +#include "KokkosBlas2_serial_gemv.hpp" +#include "KokkosBatched_Gemm_Decl.hpp" + +namespace KokkosODE { +namespace Impl { + +template +struct BDF_table {}; + +template <> +struct BDF_table<1> { + static constexpr int order = 1; + Kokkos::Array coefficients{{-1.0, 1.0}}; +}; + +template <> +struct BDF_table<2> { + static constexpr int order = 2; + Kokkos::Array coefficients{{-4.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0}}; +}; + +template <> +struct BDF_table<3> { + static constexpr int order = 3; + Kokkos::Array coefficients{ + {-18.0 / 11.0, 9.0 / 11.0, -2.0 / 11.0, 6.0 / 11.0}}; +}; + +template <> +struct BDF_table<4> { + static constexpr int order = 4; + Kokkos::Array coefficients{ + {-48.0 / 25.0, 36.0 / 25.0, -16.0 / 25.0, 3.0 / 25.0, 12.0 / 25.0}}; +}; + +template <> +struct BDF_table<5> { + static constexpr int order = 5; + Kokkos::Array coefficients{{-300.0 / 137.0, 300.0 / 137.0, + -200.0 / 137.0, 75.0 / 137.0, + -12.0 / 137.0, 60.0 / 137.0}}; +}; + +template <> +struct BDF_table<6> { + static constexpr int order = 6; + Kokkos::Array coefficients{ + {-360.0 / 147.0, 450.0 / 147.0, -400.0 / 147.0, 225.0 / 147.0, + -72.0 / 147.0, 10.0 / 147.0, 60.0 / 147.0}}; +}; + +template +struct BDF_system_wrapper { + const system_type mySys; + const int neqs; + const table_type table; + const int order = table.order; + + double t, dt; + mv_type yn; + + KOKKOS_FUNCTION + BDF_system_wrapper(const system_type& mySys_, const table_type& table_, + const double t_, const double dt_, const mv_type& yn_) + : mySys(mySys_), + neqs(mySys_.neqs), + table(table_), + t(t_), + dt(dt_), + yn(yn_) {} + + template + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + // f = f(t+dt, y) + mySys.evaluate_function(t, dt, y, f); + + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + f(eqIdx) = y(eqIdx) - table.coefficients[order] * dt * f(eqIdx); + for (int orderIdx = 0; orderIdx < order; ++orderIdx) { + f(eqIdx) += + table.coefficients[order - 1 - orderIdx] * yn(eqIdx, orderIdx); + } + } + } + + template + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + mySys.evaluate_jacobian(t, dt, y, jac); + + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = + -table.coefficients[order] * dt * jac(rowIdx, colIdx); + } + jac(rowIdx, rowIdx) += 1.0; + } + } +}; + +template +struct BDF_system_wrapper2 { + const system_type mySys; + const int neqs; + const subview_type psi; + const d_vec_type d; + + bool compute_jac = true; + double t, dt, c = 0; + + KOKKOS_FUNCTION + BDF_system_wrapper2(const system_type& mySys_, const subview_type& psi_, + const d_vec_type& d_, const double t_, const double dt_) + : mySys(mySys_), neqs(mySys_.neqs), psi(psi_), d(d_), t(t_), dt(dt_) {} + + template + KOKKOS_FUNCTION void residual(const YVectorType& y, + const FVectorType& f) const { + // f = f(t+dt, y) + mySys.evaluate_function(t, dt, y, f); + + // std::cout << "f = psi + d - c * f = " << psi(0) << " + " << d(0) << " - " + // << c << " * " << f(0) << std::endl; + + // rhs = higher order terms + y_{n+1}^i - y_n - dt*f + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + f(eqIdx) = psi(eqIdx) + d(eqIdx) - c * f(eqIdx); + } + } + + template + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + if (compute_jac) { + mySys.evaluate_jacobian(t, dt, y, jac); + + // J = I - dt*(dy/dy) + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = -dt * jac(rowIdx, colIdx); + } + jac(rowIdx, rowIdx) += 1.0; + } + } + } +}; + +template +KOKKOS_FUNCTION void BDFStep(ode_type& ode, const table_type& table, + scalar_type t, scalar_type dt, + const vec_type& y_old, const vec_type& y_new, + const vec_type& rhs, const vec_type& update, + const vec_type& scale, const mv_type& y_vecs, + const mat_type& temp, const mat_type& jac) { + using newton_params = KokkosODE::Experimental::Newton_params; + + BDF_system_wrapper sys(ode, table, t, dt, y_vecs); + const newton_params param(50, 1e-14, 1e-12); + + // first set y_new = y_old + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + y_new(eqIdx) = y_old(eqIdx); + } + + // solver the nonlinear problem + { + KokkosODE::Experimental::Newton::Solve(sys, param, jac, temp, y_new, rhs, + update, scale); + } + +} // BDFStep + +template +KOKKOS_FUNCTION void compute_coeffs(const int order, const scalar_type factor, + const mat_type& coeffs) { + coeffs(0, 0) = 1.0; + for (int colIdx = 0; colIdx < order; ++colIdx) { + coeffs(0, colIdx + 1) = 1.0; + for (int rowIdx = 0; rowIdx < order; ++rowIdx) { + coeffs(rowIdx + 1, colIdx + 1) = + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * + coeffs(rowIdx, colIdx + 1); + } + } +} + +template +KOKKOS_FUNCTION void update_D(const int order, const scalar_type factor, + const mat_type& coeffs, const mat_type& tempD, + const mat_type& D) { + auto subD = + Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(0, order + 1)); + auto subTempD = Kokkos::subview(tempD, Kokkos::ALL(), + Kokkos::pair(0, order + 1)); + + compute_coeffs(order, factor, coeffs); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subD, R, 0.0, subTempD); + + compute_coeffs(order, 1.0, coeffs); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subTempD, U, 0.0, subD); +} + +template +KOKKOS_FUNCTION void initial_step_size( + const ode_type ode, const int order, const scalar_type t0, + const scalar_type atol, const scalar_type rtol, const vec_type& y0, + const res_type& f0, const mat_type& temp, scalar_type& dt_ini) { + using KAT = Kokkos::ArithTraits; + + // Extract subviews to store intermediate data + auto scale = Kokkos::subview(temp, Kokkos::ALL(), 1); + auto y1 = Kokkos::subview(temp, Kokkos::ALL(), 2); + auto f1 = Kokkos::subview(temp, Kokkos::ALL(), 3); + + // Compute norms for y0 and f0 + double n0 = KAT::zero(), n1 = KAT::zero(), dt0; + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + scale(eqIdx) = atol + rtol * Kokkos::abs(y0(eqIdx)); + n0 += Kokkos::pow(y0(eqIdx) / scale(eqIdx), 2); + n1 += Kokkos::pow(f0(eqIdx) / scale(eqIdx), 2); + } + n0 = Kokkos::sqrt(n0) / Kokkos::sqrt(ode.neqs); + n1 = Kokkos::sqrt(n1) / Kokkos::sqrt(ode.neqs); + + // Select dt0 + if ((n0 < 1e-5) || (n1 < 1e-5)) { + dt0 = 1e-6; + } else { + dt0 = 0.01 * n0 / n1; + } + + // Estimate y at t0 + dt0 + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y1(eqIdx) = y0(eqIdx) + dt0 * f0(eqIdx); + } + + // Compute f at t0+dt0 and y1, + // then compute the norm of f(t0+dt0, y1) - f(t0, y0) + scalar_type n2 = KAT::zero(); + ode.evaluate_function(t0 + dt0, dt0, y1, f1); + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + n2 += Kokkos::pow((f1(eqIdx) - f0(eqIdx)) / scale(eqIdx), 2); + } + n2 = Kokkos::sqrt(n2) / (dt0 * Kokkos::sqrt(ode.neqs)); + + // Finally select initial time step dt_ini + if ((n1 <= 1e-15) && (n2 <= 1e-15)) { + dt_ini = Kokkos::max(1e-6, dt0 * 1e-3); + } else { + dt_ini = Kokkos::pow(0.01 / Kokkos::max(n1, n2), KAT::one() / (order + 1)); + } + + dt_ini = Kokkos::min(100 * dt0, dt_ini); + + // Zero out temp variables just to be safe... + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + scale(eqIdx) = 0; + y1(eqIdx) = 0; + f1(eqIdx) = 0; + } +} // initial_step_size + +template +KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, + scalar_type t_end, int& order, + int& num_equal_steps, const int max_newton_iters, + const scalar_type atol, const scalar_type rtol, + const scalar_type min_factor, + const vec_type& y_old, const vec_type& y_new, + const res_type& rhs, const res_type& update, + const mat_type& temp, const mat_type& temp2) { + using newton_params = KokkosODE::Experimental::Newton_params; + + constexpr int max_order = 5; + + // For NDF coefficients see Sahmpine and Reichelt, The Matlab ODE suite, SIAM + // SISCm 18, 1, p1-22, January 1997 Kokkos::Array kappa{{0., + // -0.1850, -1/9 , -0.0823000, -0.0415000, 0.}}; // NDF coefficients + // kappa gamma(i) = sum_{k=1}^i(1.0 / k); gamma(0) = 0; // NDF coefficients + // gamma_k alpha(i) = (1 - kappa(i)) * gamma(i) error_const(i) = kappa(i) * + // gamma(i) + 1 / (i + 1) + const Kokkos::Array alpha{ + {0., 1.185, 1.66666667, 1.98421667, 2.16979167, 2.28333333}}; + const Kokkos::Array error_const{ + {1., 0.315, 0.16666667, 0.09911667, 0.11354167, 0.16666667}}; + + // Extract columns of temp to form temporary + // subviews to operate on. + // const int numRows = temp.extent_int(0); const int numCols = + // temp.extent_int(1); std::cout << "numRows: " << numRows << ", numCols: " << + // numCols << std::endl; std::cout << "Extract subview from temp" << + // std::endl; + int offset = 2; + auto D = Kokkos::subview( + temp, Kokkos::ALL(), + Kokkos::pair(offset, offset + 8)); // y and its derivatives + offset += 8; + auto tempD = Kokkos::subview(temp, Kokkos::ALL(), + Kokkos::pair(offset, offset + 8)); + offset += 8; + auto scale = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Scaling coefficients for error calculation + auto y_predict = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Initial guess for y_{n+1} + auto psi = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Higher order terms contribution to rhs + auto error = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Error estimate + auto jac = Kokkos::subview( + temp, Kokkos::ALL(), + Kokkos::pair(offset, offset + ode.neqs)); // Jacobian matrix + offset += ode.neqs; + auto tmp_gesv = Kokkos::subview( + temp, Kokkos::ALL(), + Kokkos::pair( + offset, offset + ode.neqs + 4)); // Buffer space for gesv calculation + offset += ode.neqs + 4; + + auto coeffs = + Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::pair(0, 6)); + auto gamma = Kokkos::subview(temp2, Kokkos::ALL(), 6); + gamma(0) = 0.0; + gamma(1) = 1.0; + gamma(2) = 1.5; + gamma(3) = 1.83333333; + gamma(4) = 2.08333333; + gamma(5) = 2.28333333; + + BDF_system_wrapper2 sys(ode, psi, update, t, dt); + const newton_params param( + max_newton_iters, atol, + Kokkos::max(10 * Kokkos::ArithTraits::eps() / rtol, + Kokkos::min(0.03, Kokkos::sqrt(rtol)))); + + scalar_type max_step = Kokkos::ArithTraits::max(); + scalar_type min_step = Kokkos::ArithTraits::min(); + scalar_type safety = 0.675, error_norm; + if (dt > max_step) { + update_D(order, max_step / dt, coeffs, tempD, D); + dt = max_step; + num_equal_steps = 0; + } else if (dt < min_step) { + update_D(order, min_step / dt, coeffs, tempD, D); + dt = min_step; + num_equal_steps = 0; + } + + // first set y_new = y_old + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + y_new(eqIdx) = y_old(eqIdx); + } + + double t_new = 0; + bool step_accepted = false; + while (!step_accepted) { + if (dt < min_step) { + return; + } + t_new = t + dt; + + if (t_new > t_end) { + t_new = t_end; + update_D(order, (t_new - t) / dt, coeffs, tempD, D); + num_equal_steps = 0; + } + dt = t_new - t; + + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + y_predict(eqIdx) = 0; + for (int orderIdx = 0; orderIdx < order + 1; ++orderIdx) { + y_predict(eqIdx) += D(eqIdx, orderIdx); + } + scale(eqIdx) = atol + rtol * Kokkos::abs(y_predict(eqIdx)); + } + + // Compute psi, the sum of the higher order + // contribution to the residual + auto subD = + Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(1, order + 1)); + auto subGamma = + Kokkos::subview(gamma, Kokkos::pair(1, order + 1)); + KokkosBlas::Experimental::serial_gemv('N', 1.0 / alpha[order], subD, + subGamma, 0.0, psi); + + sys.compute_jac = true; + sys.c = dt / alpha[order]; + sys.jacobian(y_new, jac); + sys.compute_jac = true; + Kokkos::Experimental::local_deep_copy(y_new, y_predict); + Kokkos::Experimental::local_deep_copy(update, 0); + KokkosODE::Experimental::newton_solver_status newton_status = + KokkosODE::Experimental::Newton::Solve(sys, param, jac, tmp_gesv, y_new, + rhs, update, scale); + + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + update(eqIdx) = y_new(eqIdx) - y_predict(eqIdx); + } + + if (newton_status == + KokkosODE::Experimental::newton_solver_status::MAX_ITER) { + dt = 0.5 * dt; + update_D(order, 0.5, coeffs, tempD, D); + num_equal_steps = 0; + + } else { + // Estimate the solution error + safety = 0.9 * (2 * max_newton_iters + 1) / + (2 * max_newton_iters + param.iters); + error_norm = 0; + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + scale(eqIdx) = atol + rtol * Kokkos::abs(y_new(eqIdx)); + error(eqIdx) = error_const[order] * update(eqIdx) / scale(eqIdx); + error_norm += error(eqIdx) * error(eqIdx); + } + error_norm = Kokkos::sqrt(error_norm) / Kokkos::sqrt(sys.neqs); + + // Check error norm and adapt step size or accept step + if (error_norm > 1) { + scalar_type factor = Kokkos::max( + min_factor, safety * Kokkos::pow(error_norm, -1.0 / (order + 1))); + dt = factor * dt; + update_D(order, factor, coeffs, tempD, D); + num_equal_steps = 0; + } else { + step_accepted = true; + } + } + } // while(!step_accepted) + + // Now that our time step has been + // accepted we update all our states + // and see if we can adapt the order + // or the time step before going to + // the next step. + ++num_equal_steps; + t = t_new; + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + D(eqIdx, order + 2) = update(eqIdx) - D(eqIdx, order + 1); + D(eqIdx, order + 1) = update(eqIdx); + for (int orderIdx = order; 0 <= orderIdx; --orderIdx) { + D(eqIdx, orderIdx) += D(eqIdx, orderIdx + 1); + } + } + + // Not enough steps at constant dt + // have been succesfull so we do not + // attempt order adaptation. + double error_low = 0, error_high = 0; + if (num_equal_steps < order + 1) { + return; + } + + if (1 < order) { + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + error_low += Kokkos::pow( + error_const[order - 1] * D(eqIdx, order) / scale(eqIdx), 2); + } + error_low = Kokkos::sqrt(error_low) / Kokkos::sqrt(sys.neqs); + } else { + error_low = Kokkos::ArithTraits::max(); + } + + if (order < max_order) { + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + error_high += Kokkos::pow( + error_const[order + 1] * D(eqIdx, order + 2) / scale(eqIdx), 2); + } + error_high = Kokkos::sqrt(error_high) / Kokkos::sqrt(sys.neqs); + } else { + error_high = Kokkos::ArithTraits::max(); + } + + double factor_low, factor_mid, factor_high, factor; + factor_low = Kokkos::pow(error_low, -1.0 / order); + factor_mid = Kokkos::pow(error_norm, -1.0 / (order + 1)); + factor_high = Kokkos::pow(error_high, -1.0 / (order + 2)); + + int delta_order = 0; + if ((factor_mid < factor_low) && (factor_high < factor_low)) { + delta_order = -1; + factor = factor_low; + } else if ((factor_low < factor_high) && (factor_mid < factor_high)) { + delta_order = 1; + factor = factor_high; + } else { + delta_order = 0; + factor = factor_mid; + } + order += delta_order; + factor = Kokkos::fmin(10, safety * factor); + dt *= factor; + + update_D(order, factor, coeffs, tempD, D); + num_equal_steps = 0; + +} // BDFStep + +} // namespace Impl +} // namespace KokkosODE + +#endif // KOKKOSBLAS_BDF_IMPL_HPP diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp index d5000a74ab..348bf0aa22 100644 --- a/ode/impl/KokkosODE_Newton_impl.hpp +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -30,18 +30,29 @@ namespace KokkosODE { namespace Impl { -template +template KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( system_type& sys, const KokkosODE::Experimental::Newton_params& params, - mat_type& J, mat_type& tmp, vec_type& y0, vec_type& rhs, vec_type& update) { + mat_type& J, mat_type& tmp, ini_vec_type& y0, rhs_vec_type& rhs, + update_type& update, const scale_type& scale) { using newton_solver_status = KokkosODE::Experimental::newton_solver_status; - using value_type = typename vec_type::non_const_value_type; + using value_type = typename ini_vec_type::non_const_value_type; // Define the type returned by nrm2 to store // the norm of the residual. using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename vec_type::non_const_value_type>::mag_type; - norm_type norm = Kokkos::ArithTraits::zero(); + typename ini_vec_type::non_const_value_type>::mag_type; + sys.residual(y0, rhs); + const norm_type norm0 = KokkosBlas::serial_nrm2(rhs); + norm_type norm = Kokkos::ArithTraits::zero(); + norm_type norm_old = Kokkos::ArithTraits::zero(); + norm_type norm_new = Kokkos::ArithTraits::zero(); + norm_type rate = Kokkos::ArithTraits::zero(); + + const norm_type tol = + Kokkos::max(10 * Kokkos::ArithTraits::eps() / params.rel_tol, + Kokkos::min(0.03, Kokkos::sqrt(params.rel_tol))); // LBV - 07/24/2023: for now assume that we take // a full Newton step. Eventually this value can @@ -57,12 +68,6 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( // Solve the following linearized // problem at each iteration: J*update=-rhs // with J=du/dx, rhs=f(u_n+update)-f(u_n) - norm = KokkosBlas::serial_nrm2(rhs); - - if ((norm < params.rel_tol) || - (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { - return newton_solver_status::NLS_SUCCESS; - } // compute LHS sys.jacobian(y0, J); @@ -73,6 +78,26 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( J, update, rhs, tmp); KokkosBlas::SerialScale::invoke(-1, update); + // update solution // x = x + alpha*update + KokkosBlas::serial_axpy(alpha, update, y0); + norm = KokkosBlas::serial_nrm2(rhs); + + // Compute rms norm of the scaled update + for (int idx = 0; idx < sys.neqs; ++idx) { + norm_new = (update(idx) * update(idx)) / (scale(idx) * scale(idx)); + } + norm_new = Kokkos::sqrt(norm_new / sys.neqs); + if ((it > 0) && norm_old > Kokkos::ArithTraits::zero()) { + rate = norm_new / norm_old; + if ((rate >= 1) || + Kokkos::pow(rate, params.max_iters - it) / (1 - rate) * norm_new > + tol) { + return newton_solver_status::NLS_DIVERGENCE; + } else if ((norm_new == 0) || ((rate / (1 - rate)) * norm_new < tol)) { + return newton_solver_status::NLS_SUCCESS; + } + } + if (linSolverStat == 1) { #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -83,8 +108,12 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( return newton_solver_status::LIN_SOLVE_FAIL; } - // update solution // x = x + alpha*update - KokkosBlas::serial_axpy(alpha, update, y0); + if ((norm < (params.rel_tol * norm0)) || + (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { + return newton_solver_status::NLS_SUCCESS; + } + + norm_old = norm_new; } return newton_solver_status::MAX_ITER; } diff --git a/ode/src/KokkosODE_BDF.hpp b/ode/src/KokkosODE_BDF.hpp new file mode 100644 index 0000000000..71a450a1c6 --- /dev/null +++ b/ode/src/KokkosODE_BDF.hpp @@ -0,0 +1,227 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_BDF_HPP +#define KOKKOSODE_BDF_HPP + +/// \author Luc Berger-Vergiat (lberge@sandia.gov) +/// \file KokkosODE_BDF.hpp + +#include "Kokkos_Core.hpp" +#include "KokkosODE_Types.hpp" +#include "KokkosODE_RungeKutta.hpp" + +#include "KokkosODE_BDF_impl.hpp" + +namespace KokkosODE { +namespace Experimental { + +enum BDF_type : int { + BDF1 = 0, + BDF2 = 1, + BDF3 = 2, + BDF4 = 3, + BDF5 = 4, + BDF6 = 5 +}; + +template +struct BDF_coeff_helper { + using table_type = void; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<1>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<2>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<3>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<4>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<5>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<6>; + + BDF_coeff_helper() = default; +}; + +template +struct BDF { + using table_type = typename BDF_coeff_helper::table_type; + + template + KOKKOS_FUNCTION static void Solve( + const ode_type& ode, const scalar_type t_start, const scalar_type t_end, + const int num_steps, const vec_type& y0, const vec_type& y, + const vec_type& rhs, const vec_type& update, const vec_type& scale, + const mv_type& y_vecs, const mv_type& kstack, const mat_type& temp, + const mat_type& jac) { + const table_type table{}; + + const double dt = (t_end - t_start) / num_steps; + double t = t_start; + + // Load y0 into y_vecs(:, 0) + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y_vecs(eqIdx, 0) = y0(eqIdx); + } + + // Compute initial start-up history vectors + // Using a non adaptive explicit method. + const int init_steps = table.order - 1; + if (num_steps < init_steps) { + return; + } + KokkosODE::Experimental::ODE_params params(table.order - 1); + for (int stepIdx = 0; stepIdx < init_steps; ++stepIdx) { + KokkosODE::Experimental::RungeKutta::Solve( + ode, params, t, t + dt, y0, y, update, kstack); + + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y_vecs(eqIdx, stepIdx + 1) = y(eqIdx); + y0(eqIdx) = y(eqIdx); + } + t += dt; + } + + for (int stepIdx = init_steps; stepIdx < num_steps; ++stepIdx) { + KokkosODE::Impl::BDFStep(ode, table, t, dt, y0, y, rhs, update, scale, + y_vecs, temp, jac); + + // Update history + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y0(eqIdx) = y(eqIdx); + for (int orderIdx = 0; orderIdx < table.order - 1; ++orderIdx) { + y_vecs(eqIdx, orderIdx) = y_vecs(eqIdx, orderIdx + 1); + } + y_vecs(eqIdx, table.order - 1) = y(eqIdx); + } + t += dt; + } + } // Solve() +}; + +/// \brief BDF Solve integrates an ordinary differential equation +/// using an order and time adaptive BDF method. +/// +/// The integration starts with a BDF1 method and adaptively increases +/// or decreases both dt and the order of integration based on error +/// estimators. This function is marked as KOKKOS_FUNCTION so it can +/// be called on host and device. +/// +/// \tparam ode_type the type of the ode object to integrated +/// \tparam mv_type a rank-2 view +/// \tparam vec_type a rank-1 view +/// +/// \param ode [in]: the ode to integrate +/// \param t_start [in]: time at which the integration starts +/// \param t_end [in]: time at which the integration stops +/// \param initial_step [in]: initial value for dt +/// \param max_step [in]: maximum value for dt +/// \param y0 [in/out]: vector of initial conditions, set to the solution +/// at the end of the integration +/// \param y_new [out]: vector of solution at t_end +/// \param temp [in]: vectors for temporary storage +/// \param temp2 [in]: vectors for temporary storage +template +KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, + const scalar_type t_end, + const scalar_type initial_step, + const scalar_type max_step, const vec_type& y0, + const vec_type& y_new, mat_type& temp, + mat_type& temp2) { + using KAT = Kokkos::ArithTraits; + + // This needs to go away and be pulled out of temp instead... + auto rhs = Kokkos::subview(temp, Kokkos::ALL(), 0); + auto update = Kokkos::subview(temp, Kokkos::ALL(), 1); + // vec_type rhs("rhs", ode.neqs), update("update", ode.neqs); + (void)max_step; + + int order = 1, num_equal_steps = 0; + constexpr scalar_type min_factor = 0.2; + scalar_type dt = initial_step; + scalar_type t = t_start; + + constexpr int max_newton_iters = 10; + scalar_type atol = 1.0e-6, rtol = 1.0e-3; + + // Compute rhs = f(t_start, y0) + ode.evaluate_function(t_start, 0, y0, rhs); + + // Check if we need to compute the initial + // time step size. + if (initial_step == KAT::zero()) { + KokkosODE::Impl::initial_step_size(ode, order, t_start, atol, rtol, y0, rhs, + temp, dt); + } + + // Initialize D(:, 0) = y0 and D(:, 1) = dt*rhs + auto D = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(2, 10)); + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + D(eqIdx, 0) = y0(eqIdx); + D(eqIdx, 1) = dt * rhs(eqIdx); + rhs(eqIdx) = 0; + } + + // Now we loop over the time interval [t_start, t_end] + // and solve our ODE. + while (t < t_end) { + KokkosODE::Impl::BDFStep(ode, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, min_factor, y0, + y_new, rhs, update, temp, temp2); + + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y0(eqIdx) = y_new(eqIdx); + } + // printf("t=%f, dt=%f, y={%f, %f, %f}\n", t, dt, y0(0), y0(1), y0(2)); + } +} // BDFSolve + +} // namespace Experimental +} // namespace KokkosODE + +#endif // KOKKOSODE_BDF_HPP diff --git a/ode/src/KokkosODE_Newton.hpp b/ode/src/KokkosODE_Newton.hpp index 94c96e2eea..ffccba5cd3 100644 --- a/ode/src/KokkosODE_Newton.hpp +++ b/ode/src/KokkosODE_Newton.hpp @@ -30,12 +30,14 @@ namespace Experimental { /// \brief Newton solver for non-linear system of equations struct Newton { - template + template KOKKOS_FUNCTION static newton_solver_status Solve( const system_type& sys, const Newton_params& params, const mat_type& J, - const mat_type& tmp, const vec_type& y0, const vec_type& rhs, - const vec_type& update) { - return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update); + const mat_type& tmp, const ini_vec_type& y0, const rhs_vec_type& rhs, + const update_type& update, const scale_type& scale) { + return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update, + scale); } }; diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp index 7d78227526..5fb2c44846 100644 --- a/ode/src/KokkosODE_Types.hpp +++ b/ode/src/KokkosODE_Types.hpp @@ -54,16 +54,19 @@ struct ODE_params { enum newton_solver_status : int { NLS_SUCCESS = 0, MAX_ITER = 1, - LIN_SOLVE_FAIL = 2 + LIN_SOLVE_FAIL = 2, + NLS_DIVERGENCE = 3, }; struct Newton_params { - int max_iters; + int max_iters, iters = 0; double abs_tol, rel_tol; - // Constructor that only specify the desired number of steps. - // In this case no adaptivity is provided, the time step will - // be constant such that dt = (tend - tstart) / num_steps; + // Constructor that sets basic solver parameters + // used while solving the nonlinear system + // int max_iters_ [in]: maximum number of iterations allowed + // double abs_tol_ [in]: absolute tolerance to reach for successful solve + // double rel_tol_ [in]: relative tolerance to reach for successful solve KOKKOS_FUNCTION Newton_params(const int max_iters_, const double abs_tol_, const double rel_tol_) diff --git a/ode/unit_test/Test_ODE.hpp b/ode/unit_test/Test_ODE.hpp index 5d4861879b..1b55171581 100644 --- a/ode/unit_test/Test_ODE.hpp +++ b/ode/unit_test/Test_ODE.hpp @@ -22,5 +22,6 @@ // Implicit integrators #include "Test_ODE_Newton.hpp" +#include "Test_ODE_BDF.hpp" #endif // TEST_ODE_HPP diff --git a/ode/unit_test/Test_ODE_BDF.hpp b/ode/unit_test/Test_ODE_BDF.hpp new file mode 100644 index 0000000000..8360302971 --- /dev/null +++ b/ode/unit_test/Test_ODE_BDF.hpp @@ -0,0 +1,830 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_BDF.hpp" + +namespace Test { + +// Logistic equation +// Used to model population growth +// it is a simple nonlinear ODE with +// a lot of literature. +// +// Equation: y'(t) = r*y(t)*(1-y(t)/K) +// Jacobian: df/dy = r - 2*r*y/K +// Solution: y = K / (1 + ((K - y0) / y0)*exp(-rt)) +struct Logistic { + static constexpr int neqs = 1; + + const double r, K; + + Logistic(double r_, double K_) : r(r_), K(K_){}; + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = r * y(0) * (1.0 - y(0) / K); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = r - 2 * r * y(0) / K; + } + + template + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, + const vec_type& y) const { + y(0) = K / (1 + (K - y0) / y0 * Kokkos::exp(-r * t)); + } + +}; // Logistic + +// Lotka-Volterra equation +// A predator-prey model that describe +// population dynamics when two species +// interact. +// +// Equations: y0'(t) = alpha*y0(t) - beta*y0(t)*y1(t) +// y1'(t) = delta*y0(t)*y1(t) - gamma*y1(t) +// Jacobian: df0/dy = [alpha-beta*y1(t); beta*y0(t)] +// df1/dy = [delta*y1(t); delta*y0(t)-gamma] +// Solution: y = K / (1 + ((K - y0) / y0)*exp(-rt)) +struct LotkaVolterra { + static constexpr int neqs = 2; + + const double alpha, beta, delta, gamma; + + LotkaVolterra(double alpha_, double beta_, double delta_, double gamma_) + : alpha(alpha_), beta(beta_), delta(delta_), gamma(gamma_){}; + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = alpha * y(0) - beta * y(0) * y(1); + f(1) = delta * y(0) * y(1) - gamma * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = alpha - beta * y(1); + jac(0, 1) = -beta * y(0); + jac(1, 0) = delta * y(1); + jac(1, 1) = delta * y(0) - gamma; + } + +}; // LotkaVolterra + +// Robertson's autocatalytic chemical reaction: +// H. H. Robertson, The solution of a set of reaction rate equations, +// in J. Walsh (Ed.), Numerical Analysis: An Introduction, +// pp. 178–182, Academic Press, London (1966). +// +// Equations: y0' = -0.04*y0 + 10e4*y1*y2 +// y1' = 0.04*y0 - 10e4*y1*y2 - 3e7 * y1**2 +// y2' = 3e7 * y1**2 +struct StiffChemistry { + static constexpr int neqs = 3; + + StiffChemistry() {} + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); + f(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); + f(2) = 3.e7 * y(1) * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = -0.04; + jac(0, 1) = 1.e4 * y(2); + jac(0, 2) = 1.e4 * y(1); + jac(1, 0) = 0.04; + jac(1, 1) = -1.e4 * y(2) - 3.e7 * 2.0 * y(1); + jac(1, 2) = -1.e4 * y(1); + jac(2, 0) = 0.0; + jac(2, 1) = 3.e7 * 2.0 * y(1); + jac(2, 2) = 0.0; + } +}; + +template +struct BDFSolve_wrapper { + ode_type my_ode; + scalar_type tstart, tend; + int num_steps; + vec_type y_old, y_new, rhs, update, scale; + mv_type y_vecs, kstack; + mat_type temp, jac; + + BDFSolve_wrapper(const ode_type& my_ode_, const scalar_type tstart_, + const scalar_type tend_, const int num_steps_, + const vec_type& y_old_, const vec_type& y_new_, + const vec_type& rhs_, const vec_type& update_, + const vec_type& scale_, const mv_type& y_vecs_, + const mv_type& kstack_, const mat_type& temp_, + const mat_type& jac_) + : my_ode(my_ode_), + tstart(tstart_), + tend(tend_), + num_steps(num_steps_), + y_old(y_old_), + y_new(y_new_), + rhs(rhs_), + update(update_), + scale(scale_), + y_vecs(y_vecs_), + kstack(kstack_), + temp(temp_), + jac(jac_) {} + + KOKKOS_FUNCTION + void operator()(const int /*idx*/) const { + KokkosODE::Experimental::BDF::Solve( + my_ode, tstart, tend, num_steps, y_old, y_new, rhs, update, scale, + y_vecs, kstack, temp, jac); + } +}; + +template +struct BDF_Solve_wrapper { + const ode_type my_ode; + const scalar_type t_start, t_end, dt, max_step; + const vec_type y0, y_new; + const mat_type temp, temp2; + + BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, + const scalar_type& t_end_, const scalar_type& dt_, + const scalar_type& max_step_, const vec_type& y0_, + const vec_type& y_new_, const mat_type& temp_, + const mat_type& temp2_) + : my_ode(my_ode_), + t_start(t_start_), + t_end(t_end_), + dt(dt_), + max_step(max_step_), + y0(y0_), + y_new(y_new_), + temp(temp_), + temp2(temp2_) {} + + KOKKOS_FUNCTION void operator()(const int) const { + KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, y0, + y_new, temp, temp2); + } +}; + +template +void test_BDF_Logistic() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using mat_type = Kokkos::View; + + Kokkos::RangePolicy myPolicy(0, 1); + Logistic mySys(1, 1); + + constexpr int num_tests = 7; + int num_steps[num_tests] = {512, 256, 128, 64, 32, 16, 8}; + double errors[num_tests] = {0}; + const scalar_type t_start = 0.0, t_end = 6.0; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + vec_type scale("scaling factors", mySys.neqs); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), + temp("temp storage", mySys.neqs, mySys.neqs + 4); + mv_type kstack("Startup RK vectors", 6, mySys.neqs); + + Kokkos::deep_copy(scale, 1); + + scalar_type measured_order; + + // Test BDF1 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF1 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 1); + + Kokkos::deep_copy(y0, 0.5); + Kokkos::deep_copy(y_vecs, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + EXPECT_NEAR_KK_REL(measured_order, 2.0, 0.15); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 2, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 2.0) / 2.0 + << std::endl; +#endif + + // Test BDF2 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF2 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 2); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + EXPECT_NEAR_KK_REL(measured_order, 4.0, 0.15); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 4, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 4.0) / 4.0 + << std::endl; +#endif + + // Test BDF3 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF3 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 3); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + EXPECT_NEAR_KK_REL(measured_order, 8.0, 0.15); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 8, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 8.0) / 8.0 + << std::endl; +#endif + + // Test BDF4 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF4 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 4); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 16, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 16.0) / 16.0 + << std::endl; +#endif + + // Test BDF5 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF5 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 5); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 32, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 32.0) / 32.0 + << std::endl; +#endif + +} // test_BDF_Logistic + +template +void test_BDF_LotkaVolterra() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using mat_type = Kokkos::View; + + LotkaVolterra mySys(1.1, 0.4, 0.1, 0.4); + + const scalar_type t_start = 0.0, t_end = 100.0; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + vec_type scale("scaling factors", mySys.neqs); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), + temp("temp storage", mySys.neqs, mySys.neqs + 4); + + Kokkos::deep_copy(scale, 1); + + // Test BDF5 + mv_type kstack("Startup RK vectors", 6, mySys.neqs); + mv_type y_vecs("history vectors", mySys.neqs, 5); + + Kokkos::deep_copy(y0, 10.0); + Kokkos::deep_copy(y_vecs, 10.0); + + Kokkos::RangePolicy myPolicy(0, 1); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, scale, + y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); +} + +template +void test_BDF_StiffChemistry() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using mat_type = Kokkos::View; + + StiffChemistry mySys{}; + + const scalar_type t_start = 0.0, t_end = 500.0; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + vec_type scale("scaling factors", mySys.neqs); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), + temp("temp storage", mySys.neqs, mySys.neqs + 4); + + Kokkos::deep_copy(scale, 1); + + // Test BDF5 + mv_type kstack("Startup RK vectors", 6, mySys.neqs); + mv_type y_vecs("history vectors", mySys.neqs, 5); + + auto y0_h = Kokkos::create_mirror_view(y0); + y0_h(0) = 1.0; + y0_h(1) = 0.0; + y0_h(2) = 0.0; + Kokkos::deep_copy(y0, y0_h); + Kokkos::deep_copy(y_vecs, 0.0); + + Kokkos::RangePolicy myPolicy(0, 1); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 110000, y0, y_new, rhs, update, + scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); +} + +// template +// struct BDFSolve_parallel { +// ode_type my_ode; +// scalar_type tstart, tend; +// int num_steps; +// vec_type y_old, y_new, rhs, update, scale; +// mv_type y_vecs, kstack; +// mat_type temp, jac; + +// BDFSolve_parallel(const ode_type& my_ode_, const scalar_type tstart_, +// const scalar_type tend_, const int num_steps_, +// const vec_type& y_old_, const vec_type& y_new_, +// const vec_type& rhs_, const vec_type& update_, +// const vec_type& scale_, +// const mv_type& y_vecs_, const mv_type& kstack_, +// const mat_type& temp_, const mat_type& jac_) +// : my_ode(my_ode_), +// tstart(tstart_), +// tend(tend_), +// num_steps(num_steps_), +// y_old(y_old_), +// y_new(y_new_), +// rhs(rhs_), +// update(update_), +// scale(scale_), +// y_vecs(y_vecs_), +// kstack(kstack_), +// temp(temp_), +// jac(jac_) {} + +// KOKKOS_FUNCTION +// void operator()(const int idx) const { +// auto local_y_old = Kokkos::subview( +// y_old, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_y_new = Kokkos::subview( +// y_new, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_rhs = Kokkos::subview( +// rhs, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_update = Kokkos::subview( +// update, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); + +// auto local_y_vecs = Kokkos::subview( +// y_vecs, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1)), +// Kokkos::ALL()); +// auto local_kstack = Kokkos::subview( +// kstack, Kokkos::ALL(), +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_temp = Kokkos::subview( +// temp, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1)), +// Kokkos::ALL()); +// auto local_jac = Kokkos::subview( +// jac, Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + +// 1)), Kokkos::ALL()); + +// KokkosODE::Experimental::BDF::Solve( +// my_ode, tstart, tend, num_steps, local_y_old, local_y_new, local_rhs, +// local_update, scale, local_y_vecs, local_kstack, local_temp, +// local_jac); +// } +// }; + +// template +// void test_BDF_parallel() { +// using execution_space = typename device_type::execution_space; +// using vec_type = Kokkos::View; +// using mv_type = Kokkos::View; +// using mat_type = Kokkos::View; + +// LotkaVolterra mySys(1.1, 0.4, 0.1, 0.4); +// constexpr int num_solves = 1000; + +// vec_type scale("scaling factors", mySys.neqs); +// Kokkos::deep_copy(scale, 1); + +// const scalar_type t_start = 0.0, t_end = 100.0; +// vec_type y0("initial conditions", mySys.neqs * num_solves), +// y_new("solution", mySys.neqs * num_solves); +// vec_type rhs("rhs", mySys.neqs * num_solves), +// update("update", mySys.neqs * num_solves); +// mat_type jac("jacobian", mySys.neqs * num_solves, mySys.neqs), +// temp("temp storage", mySys.neqs * num_solves, mySys.neqs + 4); + +// // Test BDF5 +// mv_type y_vecs("history vectors", mySys.neqs * num_solves, 5), +// kstack("Startup RK vectors", 6, mySys.neqs * num_solves); + +// Kokkos::deep_copy(y0, 10.0); +// Kokkos::deep_copy(y_vecs, 10.0); + +// Kokkos::RangePolicy myPolicy(0, num_solves); +// BDFSolve_parallel +// solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, +// scale, y_vecs, +// kstack, temp, jac); +// Kokkos::parallel_for(myPolicy, solve_wrapper); + +// Kokkos::fence(); +// } + +template +void compute_coeffs(const int order, const scalar_type factor, + const mat_type& coeffs) { + std::cout << "compute_coeffs" << std::endl; + + coeffs(0, 0) = 1.0; + for (int colIdx = 0; colIdx < order; ++colIdx) { + coeffs(0, colIdx + 1) = 1.0; + for (int rowIdx = 0; rowIdx < order; ++rowIdx) { + coeffs(rowIdx + 1, colIdx + 1) = + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * + coeffs(rowIdx, colIdx + 1); + } + } +} + +template +void update_D(const int order, const scalar_type factor, const mat_type& coeffs, + const mat_type& tempD, const mat_type& D) { + auto subD = + Kokkos::subview(D, Kokkos::pair(0, order + 1), Kokkos::ALL); + auto subTempD = + Kokkos::subview(tempD, Kokkos::pair(0, order + 1), Kokkos::ALL); + + compute_coeffs(order, factor, coeffs); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + std::cout << "SerialGemm" << std::endl; + KokkosBatched::SerialGemm< + KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, R, subD, 0.0, subTempD); + + compute_coeffs(order, 1.0, coeffs); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + std::cout << "SerialGemm" << std::endl; + KokkosBatched::SerialGemm< + KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, U, subTempD, 0.0, subD); +} + +template +void test_Nordsieck() { + using execution_space = Kokkos::HostSpace; + StiffChemistry mySys{}; + + Kokkos::View R("coeffs", 6, 6), + U("coeffs", 6, 6); + Kokkos::View D("D", 8, mySys.neqs), + tempD("tmp", 8, mySys.neqs); + int order = 1; + double factor = 0.8; + + constexpr double t_start = 0.0, t_end = 500.0; + int max_steps = 200000; + double dt = (t_end - t_start) / max_steps; + + auto y0 = Kokkos::subview(D, 0, Kokkos::ALL()); + auto f = Kokkos::subview(D, 1, Kokkos::ALL()); + y0(0) = 1.0; + + mySys.evaluate_function(0, 0, y0, f); + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + f(eqIdx) *= dt; + } + + compute_coeffs(order, factor, R); + compute_coeffs(order, 1.0, U); + + { + std::cout << "R: " << std::endl; + for (int i = 0; i < order + 1; ++i) { + std::cout << "{ "; + for (int j = 0; j < order + 1; ++j) { + std::cout << R(i, j) << ", "; + } + std::cout << "}" << std::endl; + } + } + + std::cout << "D before update:" << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" + << std::endl; + std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" + << std::endl; + update_D(order, factor, R, tempD, D); + + std::cout << "D after update:" << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" + << std::endl; + std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" + << std::endl; +} + +template +void test_adaptive_BDF() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + Logistic mySys(1, 1); + + constexpr double t_start = 0.0, t_end = 6.0, atol = 1.0e-6, rtol = 1.0e-4; + constexpr int num_steps = 512, max_newton_iters = 5; + int order = 1, num_equal_steps = 0; + double dt = (t_end - t_start) / num_steps; + double t = t_start; + + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), + temp2("buffer2", 6, 7); + + // Initial condition + Kokkos::deep_copy(y0, 0.5); + + // Initialize D + auto D = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(2, 10)); + D(0, 0) = y0(0); + mySys.evaluate_function(0, 0, y0, rhs); + D(0, 1) = dt * rhs(0); + Kokkos::deep_copy(rhs, 0); + + std::cout << "**********************\n" + << " Step 1\n" + << "**********************" << std::endl; + + std::cout << "Initial conditions" << std::endl; + std::cout << " y0=" << y0(0) << ", t=" << t << ", dt=" << dt << std::endl; + + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) + << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " + << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, + update, temp, temp2); + + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + y0(eqIdx) = y_new(eqIdx); + } + + std::cout << "**********************\n" + << " Step 2\n" + << "**********************" << std::endl; + + std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; + + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) + << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " + << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, + update, temp, temp2); + + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + y0(eqIdx) = y_new(eqIdx); + } + + std::cout << "**********************\n" + << " Step 3\n" + << "**********************" << std::endl; + + std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; + + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) + << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " + << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, + update, temp, temp2); + + std::cout << "Final t: " << t << ", y=" << y_new(0) << std::endl; + +} // test_adaptive_BDF() + +template +void test_adaptive_BDF_v2() { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + using KAT = Kokkos::ArithTraits; + + std::cout << "\n\n\nBDF_v2 test starting\n" << std::endl; + + Logistic mySys(1, 1); + + const scalar_type t_start = KAT::zero(), + t_end = 6 * KAT::one(); //, atol = 1.0e-6, rtol = 1.0e-4; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + Kokkos::deep_copy(y0, 0.5); + + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), + temp2("buffer2", 6, 7); + + { + scalar_type dt = KAT::zero(); + vec_type f0("initial value f", mySys.neqs); + mySys.evaluate_function(t_start, KAT::zero(), y0, f0); + KokkosODE::Impl::initial_step_size(mySys, 1, t_start, 1e-6, 1e-3, y0, f0, + temp, dt); + + std::cout << "Initial Step Size: dt=" << dt << std::endl; + } + + KokkosODE::Experimental::BDFSolve(mySys, t_start, t_end, 0.0117188, + (t_end - t_start) / 10, y0, y_new, temp, + temp2); +} + +template +void test_BDF_adaptive_stiff() { + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + using KAT = Kokkos::ArithTraits; + + StiffChemistry mySys{}; + + const scalar_type t_start = KAT::zero(), t_end = 350 * KAT::one(); + scalar_type dt = KAT::zero(); + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + + // Set initial conditions + auto y0_h = Kokkos::create_mirror_view(y0); + y0_h(0) = KAT::one(); + y0_h(1) = KAT::zero(); + y0_h(2) = KAT::zero(); + Kokkos::deep_copy(y0, y0_h); + + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), + temp2("buffer2", 6, 7); + + Kokkos::RangePolicy policy(0, 1); + BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, + (t_end - t_start) / 10, y0, y_new, temp, temp2); + + Kokkos::parallel_for(policy, bdf_wrapper); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + std::cout << "Stiff Chemistry solution at t=500: {" << y_new_h(0) << ", " + << y_new_h(1) << ", " << y_new_h(2) << "}" << std::endl; +} + +} // namespace Test + +TEST_F(TestCategory, BDF_Logistic_serial) { + ::Test::test_BDF_Logistic(); +} +TEST_F(TestCategory, BDF_LotkaVolterra_serial) { + ::Test::test_BDF_LotkaVolterra(); +} +TEST_F(TestCategory, BDF_StiffChemistry_serial) { + ::Test::test_BDF_StiffChemistry(); +} +// TEST_F(TestCategory, BDF_parallel_serial) { +// ::Test::test_BDF_parallel(); +// } +TEST_F(TestCategory, BDF_Nordsieck) { + ::Test::test_Nordsieck(); +} +// TEST_F(TestCategory, BDF_adaptive) { +// ::Test::test_adaptive_BDF(); +// ::Test::test_adaptive_BDF_v2(); +// } +TEST_F(TestCategory, BDF_StiffChemistry_adaptive) { + ::Test::test_BDF_adaptive_stiff(); +} diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp index d235df1e56..45dd4adb6a 100644 --- a/ode/unit_test/Test_ODE_Newton.hpp +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -21,7 +21,8 @@ namespace Test { -template +template struct NewtonSolve_wrapper { using newton_params = KokkosODE::Experimental::Newton_params; @@ -32,10 +33,13 @@ struct NewtonSolve_wrapper { mat_type J, tmp; status_view status; + scale_type scale; + NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, const vec_type& x_, const vec_type& rhs_, const vec_type& update_, const mat_type& J_, - const mat_type& tmp_, const status_view& status_) + const mat_type& tmp_, const status_view& status_, + const scale_type& scale_) : my_nls(my_nls_), params(params_), x(x_), @@ -43,7 +47,8 @@ struct NewtonSolve_wrapper { update(update_), J(J_), tmp(tmp_), - status(status_) {} + status(status_), + scale(scale_) {} KOKKOS_FUNCTION void operator()(const int idx) const { @@ -71,7 +76,8 @@ struct NewtonSolve_wrapper { // Run Newton nonlinear solver status(idx) = KokkosODE::Experimental::Newton::Solve( - my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update); + my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update, + scale); } }; @@ -87,6 +93,9 @@ void run_newton_test(const system_type& mySys, Kokkos::View status("Newton status", 1); + vec_type scale("scaling factors", mySys.neqs); + Kokkos::deep_copy(scale, 1); + vec_type x("solution vector", mySys.neqs), rhs("right hand side vector", mySys.neqs); auto x_h = Kokkos::create_mirror_view(x); @@ -104,7 +113,7 @@ void run_newton_test(const system_type& mySys, Kokkos::RangePolicy my_policy(0, 1); NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status); + status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); @@ -205,6 +214,9 @@ void test_newton_status() { using vec_type = typename Kokkos::View; using mat_type = typename Kokkos::View; + vec_type scale("scaling factors", 1); + Kokkos::deep_copy(scale, 1); + double abs_tol, rel_tol; if (std::is_same_v) { rel_tol = 10e-5; @@ -227,7 +239,7 @@ void test_newton_status() { scalar_type solution[3] = {2.0, -1.0, 0.0}; #endif newton_solver_status newton_status[3] = { - newton_solver_status::NLS_SUCCESS, newton_solver_status::MAX_ITER, + newton_solver_status::NLS_SUCCESS, newton_solver_status::NLS_DIVERGENCE, newton_solver_status::LIN_SOLVE_FAIL}; vec_type x("solution vector", 1), rhs("right hand side vector", 1); auto x_h = Kokkos::create_mirror_view(x); @@ -242,7 +254,7 @@ void test_newton_status() { Kokkos::RangePolicy my_policy(0, 1); NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, - status); + status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(status_h, status); @@ -481,6 +493,9 @@ void test_newton_on_device() { system_type mySys{}; + vec_type scale("scaling factors", mySys.neqs); + Kokkos::deep_copy(scale, 1); + vec_type x("solution vector", mySys.neqs * num_systems); vec_type rhs("right hand side vector", mySys.neqs * num_systems); vec_type update("update", mySys.neqs * num_systems); @@ -503,7 +518,7 @@ void test_newton_on_device() { Kokkos::RangePolicy my_policy(0, num_systems); NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status); + status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::fence(); diff --git a/perf_test/ode/CMakeLists.txt b/perf_test/ode/CMakeLists.txt index b4aa86889f..39acabed98 100644 --- a/perf_test/ode/CMakeLists.txt +++ b/perf_test/ode/CMakeLists.txt @@ -5,4 +5,8 @@ if(KOKKOSKERNELS_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( ode_runge_kutta SOURCES KokkosODE_RK.cpp ) + + KOKKOSKERNELS_ADD_BENCHMARK( + ode_bdf_solver SOURCES KokkosODE_BDF.cpp + ) endif() diff --git a/perf_test/ode/KokkosODE_BDF.cpp b/perf_test/ode/KokkosODE_BDF.cpp new file mode 100644 index 0000000000..84a310666f --- /dev/null +++ b/perf_test/ode/KokkosODE_BDF.cpp @@ -0,0 +1,266 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosODE_BDF.hpp" + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include +#include "Benchmark_Context.hpp" + +namespace { +// Robertson's autocatalytic chemical reaction: +// H. H. Robertson, The solution of a set of reaction rate equations, +// in J. Walsh (Ed.), Numerical Analysis: An Introduction, +// pp. 178–182, Academic Press, London (1966). +// +// Equations: y0' = -0.04*y0 + 10e4*y1*y2 +// y1' = 0.04*y0 - 10e4*y1*y2 - 3e7 * y1**2 +// y2' = 3e7 * y1**2 +struct StiffChemistry { + static constexpr int neqs = 3; + + StiffChemistry() {} + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); + f(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); + f(2) = 3.e7 * y(1) * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = -0.04; + jac(0, 1) = 1.e4 * y(2); + jac(0, 2) = 1.e4 * y(1); + jac(1, 0) = 0.04; + jac(1, 1) = -1.e4 * y(2) - 3.e7 * 2.0 * y(1); + jac(1, 2) = -1.e4 * y(1); + jac(2, 0) = 0.0; + jac(2, 1) = 3.e7 * 2.0 * y(1); + jac(2, 2) = 0.0; + } +}; + +template +struct BDF_Solve_wrapper { + const ode_type my_ode; + const scalar_type t_start, t_end, dt, max_step; + const vec_type y0, y_new; + const mat_type temp, temp2; + + BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, + const scalar_type& t_end_, const scalar_type& dt_, + const scalar_type& max_step_, const vec_type& y0_, + const vec_type& y_new_, const mat_type& temp_, + const mat_type& temp2_) + : my_ode(my_ode_), + t_start(t_start_), + t_end(t_end_), + dt(dt_), + max_step(max_step_), + y0(y0_), + y_new(y_new_), + temp(temp_), + temp2(temp2_) {} + + KOKKOS_FUNCTION void operator()(const int idx) const { + auto subTemp = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::ALL(), idx); + auto subTemp2 = Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::ALL(), idx); + auto subY0 = Kokkos::subview(y0, Kokkos::ALL(), idx); + auto subYnew = Kokkos::subview(y_new, Kokkos::ALL(), idx); + + KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, + subY0, subYnew, subTemp, subTemp2); + } +}; + +} // namespace + +struct bdf_input_parameters { + int num_odes; + int repeat; + bool verbose; + + bdf_input_parameters(const int num_odes_, const int repeat_, + const bool verbose_) + : num_odes(num_odes_), repeat(repeat_), verbose(verbose_){}; +}; + +template +void run_ode_chem(benchmark::State& state, const bdf_input_parameters& inputs) { + using scalar_type = double; + using KAT = Kokkos::ArithTraits; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + StiffChemistry mySys{}; + + const bool verbose = inputs.verbose; + const int num_odes = inputs.num_odes; + const int neqs = mySys.neqs; + + const scalar_type t_start = KAT::zero(), t_end = 350 * KAT::one(); + scalar_type dt = KAT::zero(); + vec_type y0("initial conditions", neqs, num_odes); + vec_type y_new("solution", neqs, num_odes); + + // Set initial conditions + auto y0_h = Kokkos::create_mirror(y0); + for (int sysIdx = 0; sysIdx < num_odes; ++sysIdx) { + y0_h(0, sysIdx) = KAT::one(); + y0_h(1, sysIdx) = KAT::zero(); + y0_h(2, sysIdx) = KAT::zero(); + } + + mat_type temp("buffer1", neqs, 23 + 2 * neqs + 4, num_odes), + temp2("buffer2", 6, 7, num_odes); + + if (verbose) { + std::cout << "Number of problems solved in parallel: " << num_odes + << std::endl; + } + + Kokkos::RangePolicy policy(0, num_odes); + + Kokkos::Timer time; + time.reset(); + for (auto _ : state) { + (void)_; + + // Set initial conditions for each test iteration + state.PauseTiming(); + dt = KAT::zero(); + Kokkos::deep_copy(y0, y0_h); + Kokkos::deep_copy(y_new, KAT::zero()); + Kokkos::deep_copy(temp, KAT::zero()); + Kokkos::deep_copy(temp2, KAT::zero()); + BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, + (t_end - t_start) / 10, y0, y_new, temp, + temp2); + state.ResumeTiming(); + + // Actually run the time integrator + Kokkos::parallel_for(policy, bdf_wrapper); + Kokkos::fence(); + } + double run_time = time.seconds(); + std::cout << "Run time: " << run_time << std::endl; + + Kokkos::deep_copy(y0_h, y0); + double error; + for (int odeIdx = 0; odeIdx < num_odes; ++odeIdx) { + error = 0; + // error += Kokkos::abs(y0_h(0, odeIdx) - 0.4193639) / 0.4193639; + // error += Kokkos::abs(y0_h(1, odeIdx) - 0.000002843646) / 0.000002843646; + // error += Kokkos::abs(y0_h(2, odeIdx) - 0.5806333) / 0.5806333; + error += Kokkos::abs(y0_h(0, odeIdx) - 0.462966) / 0.462966; + error += Kokkos::abs(y0_h(1, odeIdx) - 3.42699e-06) / 3.42699e-06; + error += Kokkos::abs(y0_h(2, odeIdx) - 0.537030) / 0.537030; + error = error / 3; + + if (error > 1e-6) { + std::cout << "Large error in problem " << odeIdx << ": " << error + << std::endl; + } + } +} + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --repeat :: how many times to repeat overall test" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\t[Optional] --n :: number of ode problems to solve" + << std::endl; +} // print_options + +int parse_inputs(bdf_input_parameters& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--n", params.num_odes)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--repeat", + params.repeat)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +template +void run_benchmark_wrapper(benchmark::State& state, + bdf_input_parameters params) { + run_ode_chem(state, params); +} + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + + std::string bench_name = "KokkosODE_BDF_Stiff_Chem"; + bdf_input_parameters params(1000, 1, false); + parse_inputs(params, argc, argv); + + if (0 < common_params.repeat) { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_benchmark_wrapper, params) + ->UseRealTime() + ->ArgNames({"n"}) + ->Args({params.num_odes}) + ->Iterations(common_params.repeat); + } else { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_benchmark_wrapper, params) + ->UseRealTime() + ->ArgNames({"n"}) + ->Args({params.num_odes}); + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + } + Kokkos::finalize(); + + return 0; +} From a19435c50dc06e3d65d82ac4994cb01c2172a38d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 13 Mar 2024 11:33:26 -0600 Subject: [PATCH 184/326] cm_test_all_sandia: update caraway compilers add rocm/5.6.1 and rocm/6.0.0, and openblas/0.3.23 as tpl --- scripts/cm_test_all_sandia | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 95ce5c1f62..9296ea3d57 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -667,6 +667,7 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then BASE_MODULE_LIST="cmake,/" ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" + ROCM_TPL_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.23" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" @@ -674,12 +675,15 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then if [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/5.6.0 $ROCM520_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "rocm/5.6.1 $ROCM_TPL_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "rocm/6.0.0 $ROCM_TPL_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" ) else # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/5.6.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/5.6.1 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "rocm/6.0.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" From 4aa0ebd2a3f842b2e1d56cbb035a96ebe95218f1 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 13 Mar 2024 13:34:18 -0600 Subject: [PATCH 185/326] Sparse MKL: changing the location of the MKL_SAFE_CALL macro (#2134) * Sparse MKL: changing the location of the MKL_SAFE_CALL macro Moving the macro outside of namespaces to ensure that it will be interpreted correctly when called from any other location in the library. It does not make much sense to guard Impl code in the Experimental namespace and in this case it cleans up a problem with namespace disambiguation for the compiler... * Sparse BsrSpMV: removing Experimental namespace from Impl namespace * Applying clang-format * Sparse SpMV: fixing more namespace issues! --- ...Sparse_spmv_bsrmatrix_eti_spec_inst.cpp.in | 4 +--- ...rse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in | 2 -- ...parse_spmv_bsrmatrix_eti_spec_avail.hpp.in | 2 -- ...se_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in | 2 -- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 5 ----- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 14 +++++-------- sparse/src/KokkosSparse_Utils_mkl.hpp | 6 ++++++ sparse/src/KokkosSparse_spmv.hpp | 21 ++++++++++--------- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 2 -- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 8 ------- 10 files changed, 23 insertions(+), 43 deletions(-) diff --git a/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in b/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in index 9895083764..077150f36c 100644 --- a/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in +++ b/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in @@ -19,11 +19,9 @@ #include "KokkosSparse_spmv_bsrmatrix_spec.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_BSRMATRIX_ETI_INST_BLOCK@ // clang-format on } // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse \ No newline at end of file +} // namespace KokkosSparse diff --git a/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in b/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in index d089eca0e3..2c9a6083bf 100644 --- a/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in +++ b/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in @@ -19,11 +19,9 @@ #include "KokkosSparse_spmv_bsrmatrix_spec.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_MV_BSRMATRIX_ETI_INST_BLOCK@ /// // clang-format on } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in index f98e60ae0d..278b60a813 100644 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in @@ -17,12 +17,10 @@ #ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_BSRMATRIX_ETI_AVAIL_BLOCK@ // clang-format on } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in index df53928266..3247985f4c 100644 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in @@ -18,12 +18,10 @@ #define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_MV_BSRMATRIX_ETI_AVAIL_BLOCK@ // clang-format on } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 1c2e4f80e9..85e27f1b1b 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -27,7 +27,6 @@ #include namespace KokkosSparse { -namespace Experimental { namespace Impl { struct BsrMatrixSpMVTensorCoreFunctorParams { @@ -519,7 +518,6 @@ struct BsrMatrixSpMVTensorCoreDispatcher { }; } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // #if CUDA && (VOLTA || AMPERE) @@ -537,7 +535,6 @@ struct BsrMatrixSpMVTensorCoreDispatcher { #include "KokkosKernels_ExecSpaceUtils.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { namespace Bsr { @@ -1735,9 +1732,7 @@ void spMatMultiVec_transpose(const execution_space &exec, Handle *handle, /* ******************* */ } // namespace Bsr - } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_ diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index cde7fc1461..5ec3cdb50b 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -29,7 +29,6 @@ #endif namespace KokkosSparse { -namespace Experimental { namespace Impl { // default is no eti available @@ -47,10 +46,6 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; -} // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse - #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL( \ SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ @@ -97,13 +92,15 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { enum : bool { value = true }; \ }; +} // namespace Impl +} // namespace KokkosSparse + // Include which ETIs are available #include #include #include namespace KokkosSparse { -namespace Experimental { namespace Impl { // declaration @@ -218,8 +215,8 @@ struct SPMV_MV_BSRMATRIXalgo == SPMV_BSR_TC) method = Method::TensorCores; - if (!KokkosSparse::Experimental::Impl::TensorCoresAvailable< - ExecutionSpace, AMatrix, XVector, YVector>::value) { + if (!KokkosSparse::Impl::TensorCoresAvailable::value) { method = Method::Fallback; } // can't use tensor cores unless mode is no-transpose @@ -365,7 +362,6 @@ struct SPMV_MV_BSRMATRIX::name() + "]"; Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_BSRMATRIX< - ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, - YVector_Internal, false>::spmv_bsrmatrix(space, handle, mode, alpha, - A_i, x_i, beta, y_i); + Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, handle, mode, alpha, + A_i, x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { - Experimental::Impl::SPMV_BSRMATRIX< - ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, - YVector_Internal>::spmv_bsrmatrix(space, handle, mode, alpha, A_i, - x_i, beta, y_i); + Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, handle, + mode, alpha, A_i, + x_i, beta, y_i); } } else { ///////////////// @@ -402,7 +403,7 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], typename AMatrix_Internal::non_const_value_type>::name() + "]"; Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_MV_BSRMATRIX< + Impl::SPMV_MV_BSRMATRIX< ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral< @@ -411,7 +412,7 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], beta, y_i); Kokkos::Profiling::popRegion(); } else { - Experimental::Impl::SPMV_MV_BSRMATRIX< + Impl::SPMV_MV_BSRMATRIX< ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral:: diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 3a68ba348e..16bf1abecf 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -22,7 +22,6 @@ #endif namespace KokkosSparse { -namespace Experimental { namespace Impl { // Specialization struct which defines whether a specialization exists template , #endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 875913214c..c1d4744920 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -26,13 +26,10 @@ #include namespace KokkosSparse { -namespace Experimental { namespace Impl { // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -using KokkosSparse::Impl::mode_kk_to_mkl; - // Note: Scalar here is the Kokkos type, not the MKL type template inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, @@ -338,7 +335,6 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, #undef KOKKOSSPARSE_SPMV_MV_MKL } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && (__INTEL_MKL__ > 2017) @@ -359,7 +355,6 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, #include "KokkosSparse_Utils_cusparse.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { template @@ -765,7 +760,6 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // (9000 <= CUDA_VERSION) @@ -781,7 +775,6 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, #include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { template @@ -1101,7 +1094,6 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, #undef KOKKOSSPARSE_SPMV_ROCSPARSE } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) From a29e0e8f9d34f7cd3dc03413e9415a3b9693aabf Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Tue, 12 Mar 2024 18:31:43 -0500 Subject: [PATCH 186/326] Fixing missing descriptor for bsr spmv --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index c1d4744920..2ce45b4cb1 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -953,17 +953,20 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( - rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, - bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( - rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, - bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( - rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, - bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); From 3a5498d4353559b17e0712fe68241d6cf3de745a Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 14 Mar 2024 08:51:57 -0600 Subject: [PATCH 187/326] Kokkos Kernels: change the default offset ETI from size_t to int (#2140) This change makes it easier for customer to leverage TPL support which almost always requires offset=int, ordinal=int to be enabled meaning that no TPL support is available with our default ETI... --- cmake/kokkoskernels_eti_offsets.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/kokkoskernels_eti_offsets.cmake b/cmake/kokkoskernels_eti_offsets.cmake index 484175a976..1cf02f1327 100644 --- a/cmake/kokkoskernels_eti_offsets.cmake +++ b/cmake/kokkoskernels_eti_offsets.cmake @@ -1,5 +1,5 @@ -SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) -SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT OFF) +SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT OFF) +SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) SET(OFFSETS OFFSET_INT OFFSET_SIZE_T From 5b08244cb95a9633565f90700cfa2f7a4cadd734 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 14 Mar 2024 10:32:58 -0600 Subject: [PATCH 188/326] KokkosSparse_spmv_bsrmatrix_spec: fix Bsr_TC_Precision namespacing Resolve compilation errors in nightly cuda/12.2 A100 build --- sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 5ec3cdb50b..a5d0b99a23 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -243,21 +243,21 @@ struct SPMV_MV_BSRMATRIXbsr_tc_precision; switch (precision) { - case Bsr_TC_Precision::Mixed: { + case KokkosSparse::Experimental::Bsr_TC_Precision::Mixed: { BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, beta, Y); return; } - case Bsr_TC_Precision::Double: { + case KokkosSparse::Experimental::Bsr_TC_Precision::Double: { BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, beta, Y); return; } - case Bsr_TC_Precision::Automatic: // fallthrough + case KokkosSparse::Experimental::Bsr_TC_Precision::Automatic: // fallthrough default: { constexpr bool operandsHalfHalfFloat = std::is_same::value && From 98d37b5a8f69a1dc28ec3bbf07f445c4c8e0edf7 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 14 Mar 2024 10:41:43 -0600 Subject: [PATCH 189/326] Drop comment for cleaner clang-format fix --- sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index a5d0b99a23..dddf6e1472 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -257,7 +257,7 @@ struct SPMV_MV_BSRMATRIX::value && From f492f59460312445abed2b7d0f7a0d19198a0798 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 14 Mar 2024 16:16:22 -0400 Subject: [PATCH 190/326] Fix usage of RAII to set cusparse/rocsparse stream (#2141) Temporary objects like "A()" get destructed immediately. For the object to have scope lifetime, it needs a name like "A a();". This was causing cusparse/rocsparse spmv to always execute on the default stream, causing incorrect timing in the spmv perf test. --- sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 4 ++-- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 2 +- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 2ce45b4cb1..9c844ff910 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -372,7 +372,7 @@ void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, cusparseHandle_t cusparseHandle = KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); + KokkosSparse::Impl::TemporarySetCusparseStream tscs(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -487,7 +487,7 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, cusparseHandle_t cusparseHandle = KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); + KokkosSparse::Impl::TemporarySetCusparseStream tscs(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 1de91cdf27..45719def45 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -111,7 +111,7 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, cusparseHandle_t cusparseHandle = KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - TemporarySetCusparseStream(cusparseHandle, exec); + TemporarySetCusparseStream tscs(cusparseHandle, exec); /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ const cusparseIndexType_t myCusparseOffsetType = diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 1c589b2330..926d201a52 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -41,7 +41,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], cusparseHandle_t cusparseHandle = KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - TemporarySetCusparseStream(cusparseHandle, exec); + TemporarySetCusparseStream tscs(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -389,7 +389,7 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], rocsparse_handle rocsparseHandle = KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; /* Set rocsparse to use the given stream until this function exits */ - TemporarySetRocsparseStream(rocsparseHandle, exec); + TemporarySetRocsparseStream tsrs(rocsparseHandle, exec); /* Set the operation mode */ rocsparse_operation myRocsparseOperation = mode_kk_to_rocsparse(mode); From 2f66110c7ad9e3fd10a2638e9b557b5226c3a85e Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 14 Mar 2024 18:50:41 -0400 Subject: [PATCH 191/326] Use execution space operator== (#2136) It actually is part of the public interface --- sparse/src/KokkosSparse_spmv_handle.hpp | 33 +------------------------ 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index 2ce32a1f98..9e7295c72c 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -82,37 +82,6 @@ inline bool is_spmv_algorithm_native(SPMVAlgorithm a) { } namespace Impl { -// Execution spaces do not support operator== in public interface, even though -// in practice the major async/GPU spaces do have the feature. -// This is a conservative check for whether e1 and e2 are known to be the -// same. If it cannot be determined, assume they are different. -template -inline bool exec_spaces_same(const ExecutionSpace&, const ExecutionSpace&) { - return false; -} - -#ifdef KOKKOS_ENABLE_CUDA -template <> -inline bool exec_spaces_same(const Kokkos::Cuda& e1, - const Kokkos::Cuda& e2) { - return e1.impl_internal_space_instance() == e2.impl_internal_space_instance(); -} -#endif -#ifdef KOKKOS_ENABLE_HIP -template <> -inline bool exec_spaces_same(const Kokkos::HIP& e1, - const Kokkos::HIP& e2) { - return e1.impl_internal_space_instance() == e2.impl_internal_space_instance(); -} -#endif -#ifdef KOKKOS_ENABLE_SYCL -template <> -inline bool exec_spaces_same( - const Kokkos::Experimental::SYCL& e1, - const Kokkos::Experimental::SYCL& e2) { - return e1.impl_internal_space_instance() == e2.impl_internal_space_instance(); -} -#endif template struct TPL_SpMV_Data { @@ -124,7 +93,7 @@ struct TPL_SpMV_Data { // If it is, fence the old exec now. // That way, SPMVHandle cleanup doesn't need // to worry about resources still being in use on the old exec. - if (!exec_spaces_same(exec, new_exec)) { + if (exec != new_exec) { exec.fence(); exec = new_exec; } From acd71413dff0878fc2b675f6b20c1c3eaa9ef927 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 15 Mar 2024 07:41:45 -0600 Subject: [PATCH 192/326] cm_test_all_sandia: more caraway module updates and cleanup (#2145) --- scripts/cm_test_all_sandia | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 9296ea3d57..eb296091af 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -634,8 +634,8 @@ elif [ "$MACHINE" = "caraway" ]; then # report_and_log_test_result: only testing compilation of code for now, # output description and success based only on build succes; build time output (no run-time) - BASE_MODULE_LIST="cmake/3.19.3,/" - ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" + BASE_MODULE_LIST="cmake,/" + ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" @@ -647,10 +647,7 @@ elif [ "$MACHINE" = "caraway" ]; then else # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) fi @@ -666,7 +663,7 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then # output description and success based only on build succes; build time output (no run-time) BASE_MODULE_LIST="cmake,/" - ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" + ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20" ROCM_TPL_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.23" HIPCLANG_BUILD_LIST="Hip_Serial" @@ -674,20 +671,15 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then if [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/5.6.0 $ROCM520_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "rocm/5.6.1 $ROCM_TPL_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + COMPILERS=("rocm/5.6.1 $ROCM_TPL_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/6.0.0 $ROCM_TPL_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" ) else # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "rocm/5.6.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/5.6.1 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/6.0.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) fi From 0c49c2100bf0b17418add7e59d2d60d997666f3f Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Fri, 15 Mar 2024 11:46:11 -0400 Subject: [PATCH 193/326] Spmv perftest improvements (#2146) * Spmv perf test improvements - Add option to flush caches by filling a dummy buffer between iterations - Add option to call the non-reuse interface instead of handle/reuse interface - Fix modes T, H in nonsquare case (make x,y the correct length) * Fix mode help text --- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 279 ++++++++++++++-------- 1 file changed, 178 insertions(+), 101 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 1024356f7b..194ee9afd4 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -28,96 +28,159 @@ #include #include #include +#include // for graph_max_degree #include #include "KokkosKernels_default_types.hpp" -typedef default_scalar Scalar; -typedef default_lno_t Ordinal; -typedef default_size_type Offset; - -template -void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, - int num_vecs, char mode, Scalar beta) { - using matrix_type = - KokkosSparse::CrsMatrix; - using mv_type = Kokkos::View; - using h_mv_type = typename mv_type::HostMirror; - - srand(17312837); - matrix_type A; - if (filename) - A = KokkosSparse::Impl::read_kokkos_crst_matrix(filename); - else { - Offset nnz = 10 * numRows; - // note: the help text says the bandwidth is fixed at 0.01 * numRows - A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, 0, 0.01 * numRows); - } - numRows = A.numRows(); - numCols = A.numCols(); - - std::cout << "A is " << numRows << "x" << numCols << ", with " << A.nnz() - << " nonzeros\n"; - std::cout << "SpMV mode " << mode << ", " << num_vecs - << " vectors, beta = " << beta << ", multivectors are "; - std::cout << (std::is_same_v ? "LayoutLeft" - : "LayoutRight"); - std::cout << '\n'; - - mv_type x("X", numCols, num_vecs); - mv_type y("Y", numRows, num_vecs); - - h_mv_type h_x = Kokkos::create_mirror_view(x); - h_mv_type h_y = Kokkos::create_mirror_view(y); - h_mv_type h_y_compare = Kokkos::create_mirror(y); - - for (int v = 0; v < num_vecs; v++) { - for (int i = 0; i < numCols; i++) { - h_x(i, v) = (Scalar)(1.0 * (rand() % 40) - 20.); - } - } +using Scalar = default_scalar; +using Ordinal = default_lno_t; +using Offset = default_size_type; +using KAT = Kokkos::ArithTraits; + +struct SPMVBenchmarking { + // note: CLI currently only allows square matrices to be randomly generated + // and nz/row is fixed at 10 + Ordinal num_rows = 110503; + Ordinal num_cols = 110503; + char mode = 'N'; + int loop = 100; + int num_vecs = 1; + Scalar beta = KAT::zero(); + std::string filename = ""; + bool flush_cache = false; + bool non_reuse = false; - Kokkos::deep_copy(x, h_x); - - // Benchmark - auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0); - auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0); - - // Create handles for both rank-1 and rank-2 cases, - // even though only 1 will get used below (depending on num_vecs) - - KokkosSparse::SPMVHandle - handle_rank1; - KokkosSparse::SPMVHandle - handle_rank2; - // Do 5 warm up calls (not timed). This will also initialize the handle. - for (int i = 0; i < 5; i++) { - if (num_vecs == 1) { - // run the rank-1 version - KokkosSparse::spmv(&handle_rank1, &mode, 1.0, A, x0, beta, y0); + // Using the parameters above, run and time spmv where x and y use the given + // memory layout. + template + void run() { + using matrix_type = + KokkosSparse::CrsMatrix; + using mv_type = Kokkos::View; + using h_mv_type = typename mv_type::HostMirror; + + srand(17312837); + matrix_type A; + if (filename != "") { + std::cout << "Reading A from file \"" << filename << "\"...\n"; + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + filename.c_str()); + num_rows = A.numRows(); + num_cols = A.numCols(); } else { - // rank-2 - KokkosSparse::spmv(&handle_rank2, &mode, 1.0, A, x, beta, y); + std::cout << "Randomly generating A...\n"; + Offset nnz = 10 * num_rows; + // note: the help text says the bandwidth is fixed at 0.01 * numRows + A = KokkosSparse::Impl::kk_generate_sparse_matrix( + num_rows, num_cols, nnz, 0, 0.01 * num_rows); } - Kokkos::DefaultExecutionSpace().fence(); - } - Kokkos::Timer timer; - for (int i = 0; i < loop; i++) { - if (num_vecs == 1) { - // run the rank-1 version - KokkosSparse::spmv(&handle_rank1, &mode, 1.0, A, x0, beta, y0); - } else { - // rank-2 - KokkosSparse::spmv(&handle_rank2, &mode, 1.0, A, x, beta, y); + + std::cout << "A is " << A.numRows() << "x" << A.numCols() << ", with " + << A.nnz() << " nonzeros\n"; + std::cout << "Mean nnz/row: " << (double)A.nnz() / A.numRows() << '\n'; + std::cout << "Max nnz/row: " + << KokkosSparse::Impl::graph_max_degree< + Kokkos::DefaultExecutionSpace, Ordinal>(A.graph.row_map) + << '\n'; + std::cout << "SpMV mode " << mode << ", " << num_vecs + << " vectors, beta = " << beta << ", multivectors are "; + std::cout << (std::is_same_v ? "LayoutLeft" + : "LayoutRight"); + std::cout << '\n'; + + bool transpose_like = (mode == 'T') || (mode == 'H'); + + Ordinal xlen = transpose_like ? A.numRows() : A.numCols(); + Ordinal ylen = transpose_like ? A.numCols() : A.numRows(); + + mv_type x("X", xlen, num_vecs); + mv_type y("Y", ylen, num_vecs); + + h_mv_type h_x = Kokkos::create_mirror_view(x); + h_mv_type h_y = Kokkos::create_mirror_view(y); + h_mv_type h_y_compare = Kokkos::create_mirror(y); + + for (int v = 0; v < num_vecs; v++) { + for (Ordinal i = 0; i < xlen; i++) { + h_x(i, v) = (Scalar)(1.0 * (rand() % 40) - 20.); + } + } + + Kokkos::deep_copy(x, h_x); + + // Benchmark + auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0); + auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0); + + // Create handles for both rank-1 and rank-2 cases, + // even though only 1 will get used below (depending on num_vecs) + + KokkosSparse::SPMVHandle + handle_rank1; + KokkosSparse::SPMVHandle + handle_rank2; + // Assuming that 1GB is enough to fully clear the L3 cache of a CPU, or the + // L2 of a GPU. (Some AMD EPYC chips have 768 MB L3) + Kokkos::View cacheFlushData; + if (flush_cache) { + Kokkos::resize(cacheFlushData, 1024 * 1024 * 1024); + } + + Kokkos::DefaultExecutionSpace space; + + // Do 5 warm up calls (not timed). This will also initialize the handle. + for (int i = 0; i < 5; i++) { + if (num_vecs == 1) { + // run the rank-1 version + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x0, beta, y0); + else + KokkosSparse::spmv(space, &handle_rank1, &mode, 1.0, A, x0, beta, y0); + } else { + // rank-2 + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x, beta, y); + else + KokkosSparse::spmv(space, &handle_rank2, &mode, 1.0, A, x, beta, y); + } + space.fence(); + } + + double totalTime = 0; + Kokkos::Timer timer; + for (int i = 0; i < loop; i++) { + if (flush_cache) { + // Copy some non-zero data to the view multiple times to flush the + // cache. (nonzero in case the system has an optimized path for zero + // pages) + for (int rep = 0; rep < 4; rep++) + Kokkos::deep_copy(space, cacheFlushData, char(rep + 1)); + } + space.fence(); + timer.reset(); + if (num_vecs == 1) { + // run the rank-1 version + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x0, beta, y0); + else + KokkosSparse::spmv(space, &handle_rank1, &mode, 1.0, A, x0, beta, y0); + } else { + // rank-2 + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x, beta, y); + else + KokkosSparse::spmv(space, &handle_rank2, &mode, 1.0, A, x, beta, y); + } + space.fence(); + totalTime += timer.seconds(); } - Kokkos::DefaultExecutionSpace().fence(); + double avg_time = totalTime / loop; + std::cout << avg_time << " s\n"; } - double avg_time = timer.seconds() / loop; - std::cout << avg_time << " s\n"; -} +}; void print_help() { printf(" -s [nrows] : matrix dimension (square)\n"); @@ -128,8 +191,11 @@ void print_help() { " --layout left|right : memory layout of x/y. Default depends on " "build's default execution space\n"); printf( - " -m N|T : matrix apply mode: N (normal, default), T " - "(transpose)\n"); + " -m N|T|H|C : matrix apply mode:\n" + " N - normal, default\n" + " T - transpose\n" + " H - conjugate transpose\n" + " C - conjugate\n"); printf( " -f [file],-fb [file] : Read in Matrix Market (.mtx), or binary " "(.bin) matrix file.\n"); @@ -137,21 +203,21 @@ void print_help() { " -l [LOOP] : How many spmv to run to aggregate average " "time. \n"); printf(" -b beta : beta, as in y := Ax + (beta)y\n"); + printf( + " --flush : Flush the cache between each spmv call " + "(slow!)\n"); + printf( + " --non-reuse : Use non-reuse interface (without " + "SPMVHandle)\n"); } int main(int argc, char** argv) { - long long int size = 110503; // a prime number - char* filename = NULL; - - char mode = 'N'; + SPMVBenchmarking sb; char layout; if (std::is_same::value) layout = 'L'; else layout = 'R'; - int loop = 100; - int num_vecs = 1; - Scalar beta = 0.0; if (argc == 1) { print_help(); @@ -160,27 +226,31 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; i++) { if ((strcmp(argv[i], "-s") == 0)) { - size = atoi(argv[++i]); + // only square matrices supported now + sb.num_rows = atoi(argv[++i]); + sb.num_cols = sb.num_rows; continue; } if ((strcmp(argv[i], "-f") == 0 || strcmp(argv[i], "-fb") == 0)) { - filename = argv[++i]; + sb.filename = argv[++i]; continue; } if ((strcmp(argv[i], "-l") == 0)) { - loop = atoi(argv[++i]); + sb.loop = atoi(argv[++i]); continue; } if ((strcmp(argv[i], "-m") == 0)) { - mode = toupper(argv[++i][0]); + sb.mode = toupper(argv[++i][0]); + if (sb.mode != 'N' && sb.mode != 'T' && sb.mode != 'C' && sb.mode != 'H') + throw std::invalid_argument("Mode must be one of N, T, C or H."); continue; } if ((strcmp(argv[i], "--nv") == 0)) { - num_vecs = atoi(argv[++i]); + sb.num_vecs = atoi(argv[++i]); continue; } if ((strcmp(argv[i], "-b") == 0)) { - beta = atof(argv[++i]); + sb.beta = atof(argv[++i]); continue; } if ((strcmp(argv[i], "--layout") == 0)) { @@ -191,6 +261,15 @@ int main(int argc, char** argv) { layout = 'R'; else throw std::runtime_error("Invalid layout"); + continue; + } + if ((strcmp(argv[i], "--flush") == 0)) { + sb.flush_cache = true; + continue; + } + if ((strcmp(argv[i], "--non-reuse") == 0)) { + sb.non_reuse = true; + continue; } if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { print_help(); @@ -201,11 +280,9 @@ int main(int argc, char** argv) { Kokkos::initialize(argc, argv); if (layout == 'L') - run_spmv(size, size, filename, loop, num_vecs, mode, - beta); + sb.template run(); else - run_spmv(size, size, filename, loop, num_vecs, mode, - beta); + sb.template run(); Kokkos::finalize(); } From d84217552b2344f956232ce9da4ecedad8a11fea Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 18 Mar 2024 09:20:06 -0600 Subject: [PATCH 194/326] KokkosKernels Utils: cleaning the zero_vector interface One of the overload requires an unused template, removing that extraneous template and simplify how that function is called in a second overload. --- common/src/KokkosKernels_Utils.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index e1c15505ff..ba8049cecf 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -890,7 +890,7 @@ void permute_block_vector(typename idx_array_type::value_type num_elements, // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. -template +template void zero_vector(ExecSpaceIn &exec_space_in, typename value_array_type::value_type /* num_elements */, value_array_type &vector) { @@ -906,8 +906,7 @@ void zero_vector(typename value_array_type::value_type /* num_elements */, using ne_tmp_t = typename value_array_type::value_type; ne_tmp_t ne_tmp = ne_tmp_t(0); MyExecSpace my_exec_space; - zero_vector(my_exec_space, ne_tmp, - vector); + zero_vector(my_exec_space, ne_tmp, vector); } template From e88b1a12ab0550eb74ae3e0dd09f07c9fe1b5efb Mon Sep 17 00:00:00 2001 From: "Luc Berger-Vergiat (-EXP)" Date: Mon, 18 Mar 2024 17:21:19 -0600 Subject: [PATCH 195/326] Kokkos Kernels: fixing call to zero_vector in Gauss-Seidel --- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 7391e00e3d..02626bab44 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1548,7 +1548,7 @@ class PointGaussSeidel { } if (init_zero_x_vector) { KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + MyExecSpace, scalar_persistent_work_view2d_t>( my_exec_space, num_cols * block_size, Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< @@ -1665,7 +1665,7 @@ class PointGaussSeidel { } if (init_zero_x_vector) { KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + MyExecSpace, scalar_persistent_work_view2d_t>( my_exec_space, num_cols, Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< From 32944bdffc55e8f76e29c92dc11a858d8b915e3a Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 20 Mar 2024 16:14:31 -0600 Subject: [PATCH 196/326] CMakeLists.txt: Update develop to 4.3.99 --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ab92213fe4..8fee2acd72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 2) +SET(KokkosKernels_VERSION_MINOR 3) SET(KokkosKernels_VERSION_PATCH 99) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") @@ -127,13 +127,13 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - IF((${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.2.00")) + IF((${Kokkos_VERSION} VERSION_EQUAL "4.2.00") OR (${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.3.00")) MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") - IF((${Kokkos_VERSION} VERSION_GREATER "4.2.99")) + IF((${Kokkos_VERSION} VERSION_GREATER "4.3.99")) MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.1.00, 4.2.00 or develop") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.2.00, 4.3.00 or develop") ENDIF() ENDIF() From 237f746e762f6ed9fb502660bd58ecc7a89cf7d2 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 20 Mar 2024 17:20:34 -0600 Subject: [PATCH 197/326] Address reviewer comments --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8fee2acd72..0d75e8f035 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,13 +127,13 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - IF((${Kokkos_VERSION} VERSION_EQUAL "4.2.00") OR (${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.3.00")) + IF(${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.2.00") MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") IF((${Kokkos_VERSION} VERSION_GREATER "4.3.99")) MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.2.00, 4.3.00 or develop") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.2.00 or greater") ENDIF() ENDIF() From 5e9adf527d4b143d4f850800e082f88817992ea5 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 20 Mar 2024 17:32:27 -0600 Subject: [PATCH 198/326] Update CMakeLists.txt Co-authored-by: brian-kelley --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d75e8f035..0c02d71c49 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,7 +133,7 @@ ELSE() MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.2.00 or greater") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.2.00 or greater (found ${Kokkos_VERSION})") ENDIF() ENDIF() From e6315e2b82b66e5aa51b7de3b9905ac94066ea0e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 25 Mar 2024 13:04:47 -0600 Subject: [PATCH 199/326] cm_test_all_sandia: solo updates module updates post TOSS upgrade --- scripts/cm_test_all_sandia | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index eb296091af..db13665a3b 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -743,11 +743,11 @@ elif [ "$MACHINE" = "solo" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - module load cmake/3.22.3 + module load cmake - BASE_MODULE_LIST="cmake/3.22.3,/" - BASE_MODULE_LIST_LLVM="cmake/3.22.3,/,gnu/10.2.1" - BASE_MODULE_LIST_INTEL="cmake/3.22.3,gnu/8.2.1,/" + BASE_MODULE_LIST="cmake,/" + BASE_MODULE_LIST_LLVM="cmake,/,gnu/10.2.1" + BASE_MODULE_LIST_INTEL="cmake,/" ONEAPI_WARNING_FLAGS="" GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" @@ -757,12 +757,19 @@ elif [ "$MACHINE" = "solo" ]; then "llvm/10.0.1 $BASE_MODULE_LIST_LLVM "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then - COMPILERS=("intel/19.0.5.281 $BASE_MODULE_LIST_INTEL,mkl/19.0.5.281 "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/19.1 $BASE_MODULE_LIST_INTEL,mkl/19.1 "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gnu/10.2.1 $GNU102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" ) else - ###"clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + "gnu/11.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + "gnu/12.1.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + "llvm/10.0.1 $BASE_MODULE_LIST_LLVM $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "intel/19.1 $BASE_MODULE_LIST_INTEL,mkl/19.1 "$INTEL_BUILD_LIST" icpc $INTEL_WARNING_FLAGS" + "intel/21.3.0 $BASE_MODULE_LIST_INTEL,mkl/21.3.0 "$INTEL_BUILD_LIST" icpc $INTEL_WARNING_FLAGS" + "intel/23.0.0 $BASE_MODULE_LIST_INTEL,mkl/23.0.0 "$INTEL_BUILD_LIST" icpc $INTEL_WARNING_FLAGS" + "intel/23.1.0 $BASE_MODULE_LIST_INTEL,mkl/23.1.0 "$INTEL_BUILD_LIST" icpc $INTEL_WARNING_FLAGS" + "intel/23.2.0 $BASE_MODULE_LIST_INTEL,mkl/23.2.0 "$INTEL_BUILD_LIST" icpc $INTEL_WARNING_FLAGS" ) fi From 89598d819fb74374d5db988adb19bed86cc5e0d8 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 25 Mar 2024 13:05:47 -0600 Subject: [PATCH 200/326] Fix signed/unsigned comparison warnings (#2150) This is only hit when spmv is called with integer scalars, which doesn't happen in our CI but does often in Tpetra. --- sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 2 +- sparse/impl/KokkosSparse_spmv_spec.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index dddf6e1472..5c2bf0edfa 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -349,7 +349,7 @@ struct SPMV_MV_BSRMATRIX, "This implementation is only for integer Scalar types."); - for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { + for (size_t j = 0; j < X.extent(1); ++j) { const auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); typedef SPMV_BSRMATRIX, "This implementation is only for integer Scalar types."); KokkosKernels::Experimental::Controls defaultControls; - for (typename AMatrix::non_const_size_type j = 0; j < x.extent(1); ++j) { + for (size_t j = 0; j < x.extent(1); ++j) { auto x_j = Kokkos::subview(x, Kokkos::ALL(), j); auto y_j = Kokkos::subview(y, Kokkos::ALL(), j); typedef SPMV Date: Tue, 26 Mar 2024 09:29:36 -0600 Subject: [PATCH 201/326] SPMV tpl fixes, cusparse workaround (#2152) * SPMV tpl fixes, workaround * Avoid possible integer conversion warnings * Document cusparseSpMM algos that were tested --- .../KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 9 +- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 25 ++++-- sparse/unit_test/Test_Sparse_spmv.hpp | 85 +++++++++++++------ 3 files changed, 87 insertions(+), 32 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 0ef473c54a..88fef4421a 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -51,7 +51,14 @@ struct spmv_mv_tpl_spec_avail { non-transpose that produces incorrect result. This is cusparse distributed with CUDA 10.1.243. The bug seems to be resolved by CUSPARSE 10301 (present by CUDA 10.2.89) */ -#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) + +/* cusparseSpMM also produces incorrect results for some inputs in CUDA 11.6.1. + * (CUSPARSE_VERSION 11702). + * ALG1 and ALG3 produce completely incorrect results for one set of inputs. + * ALG2 works for that case, but has low numerical accuracy in another case. + */ +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION != 11702) KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 45719def45..47b7d47f8e 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -24,8 +24,12 @@ /* CUSPARSE_VERSION < 10301 either doesn't have cusparseSpMM or the non-tranpose version produces incorrect results. + + Version 11702 corresponds to CUDA 11.6.1, which also produces incorrect + results. 11701 (CUDA 11.6.0) is OK. */ -#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION != 11702) #include "cusparse.h" #include "KokkosSparse_Utils_cusparse.hpp" @@ -63,9 +67,14 @@ inline cudaDataType compute_type() { */ template = true> cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { - const int64_t rows = view.extent(0); - const int64_t cols = view.extent(1); - const int64_t ld = view.extent(0); + // If the view is LayoutRight, we still need to create descr as column-major + // but it should be an implicit transpose, meaning dimensions and strides are + // swapped + bool transpose = + std::is_same_v; + const size_t rows = transpose ? view.extent(1) : view.extent(0); + const size_t cols = transpose ? view.extent(0) : view.extent(1); + const size_t ld = transpose ? view.stride(0) : view.stride(1); // cusparseCreateCsr notes it is safe to const_cast this away for input // pointers to a descriptor as long as that descriptor is not an output @@ -83,8 +92,9 @@ cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { const cusparseOrder_t order = CUSPARSE_ORDER_COL; cusparseDnMatDescr_t descr; - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseCreateDnMat(&descr, rows, cols, ld, values, valueType, order)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnMat( + &descr, static_cast(rows), static_cast(cols), + static_cast(ld), values, valueType, order)); return descr; } @@ -143,6 +153,9 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, constexpr bool xIsLR = std::is_same::value; static_assert(xIsLL || xIsLR, "X multivector was not LL or LR (TPL error)"); + static_assert( + std::is_same_v, + "Y multivector was not LL (TPL error)"); cusparseDnMatDescr_t vecX = make_cusparse_dn_mat_descr_t(x); cusparseDnMatDescr_t vecY = make_cusparse_dn_mat_descr_t(y); cusparseOperation_t opB = diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index b377806928..c5107fcf0a 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -518,13 +518,13 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, using handle_t = KokkosSparse::SPMVHandle; - ViewTypeX b_x("A", numRows, numMV); - ViewTypeY b_y("B", numCols, numMV); - ViewTypeY b_y_copy("B", numCols, numMV); + ViewTypeX b_x("A", numCols, numMV); + ViewTypeY b_y("B", numRows, numMV); + ViewTypeY b_y_copy("B", numRows, numMV); - ViewTypeX b_xt("A", numCols, numMV); - ViewTypeY b_yt("B", numRows, numMV); - ViewTypeY b_yt_copy("B", numRows, numMV); + ViewTypeX b_xt("A", numRows, numMV); + ViewTypeY b_yt("B", numCols, numMV); + ViewTypeY b_yt_copy("B", numCols, numMV); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -582,9 +582,9 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, } template -void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, int numMV) { + typename layout_x, typename layout_y, class Device> +void test_spmv_mv_heavy(lno_t numRows, lno_t numCols, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, int numMV) { #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) || defined(KOKKOS_ARCH_A64FX) if (std::is_same>::value) { std::cerr @@ -596,8 +596,8 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL || KOKKOS_ARCH_A64FX using crsMat_t = typename KokkosSparse::CrsMatrix; - using ViewTypeX = Kokkos::View; - using ViewTypeY = Kokkos::View; + using ViewTypeX = Kokkos::View; + using ViewTypeY = Kokkos::View; using mag_t = typename Kokkos::ArithTraits::mag_type; using handle_t = KokkosSparse::SPMVHandle; @@ -607,7 +607,7 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, constexpr mag_t max_val = static_cast(10); crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numRows, nnz, row_size_variance, bandwidth); + numRows, numCols, nnz, row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -615,15 +615,22 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, numRows ? (nnz / numRows + row_size_variance) : 0; for (int nv = 1; nv <= numMV; nv++) { - ViewTypeX b_x("A", numRows, nv); + ViewTypeX b_x("A", numCols, nv); ViewTypeY b_y("B", numRows, nv); ViewTypeY b_y_copy("B", numRows, nv); + ViewTypeX b_xt("A", numRows, nv); + ViewTypeY b_yt("B", numCols, nv); + ViewTypeY b_yt_copy("B", numCols, nv); + Kokkos::fill_random(b_x, rand_pool, scalar_t(10)); Kokkos::fill_random(b_y, rand_pool, scalar_t(10)); + Kokkos::fill_random(b_xt, rand_pool, scalar_t(10)); + Kokkos::fill_random(b_yt, rand_pool, scalar_t(10)); Kokkos::fill_random(input_mat.values, rand_pool, scalar_t(10)); Kokkos::deep_copy(b_y_copy, b_y); + Kokkos::deep_copy(b_yt_copy, b_yt); handle_t handle; @@ -633,9 +640,9 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, "N", max_y); Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, "N", max_y + max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, 1.0, 0.0, nv, "T", max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, 0.0, 1.0, nv, "T", max_y); // Testing all modes together, since matrix is square std::vector modes = {"N", "C", "T", "H"}; @@ -645,8 +652,13 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, alpha, - beta, nv, mode, max_error); + if (*mode == 'N' || *mode == 'C') { + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, alpha, + beta, nv, mode, max_error); + } else { + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, + alpha, beta, nv, mode, max_error); + } } } } @@ -1189,19 +1201,30 @@ void test_spmv_all_interfaces_light() { TestCategory, \ sparse##_##spmv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ test_spmv_mv( \ - 1000, 1000 * 3, 200, 10, true, 1); \ + 1001, 1001 * 3, 200, 10, true, 1); \ test_spmv_mv( \ - 1000, 1000 * 3, 100, 10, true, 5); \ + 999, 999 * 3, 100, 10, true, 5); \ test_spmv_mv( \ - 1000, 1000 * 2, 100, 5, true, 10); \ + 1003, 1003 * 2, 100, 5, true, 10); \ test_spmv_mv( \ - 50000, 50000 * 3, 20, 10, false, 1); \ + 50007, 50007 * 3, 20, 10, false, 1); \ test_spmv_mv( \ - 50000, 50000 * 3, 100, 10, false, 1); \ + 50002, 50002 * 3, 100, 10, false, 1); \ test_spmv_mv( \ 10000, 10000 * 2, 100, 5, false, 5); \ - test_spmv_mv_heavy( \ - 200, 200 * 10, 60, 4, 30); \ + test_spmv_mv_heavy(204, 201, 204 * 10, 60, 4, 30); \ + test_spmv_mv_heavy(2, 3, 5, 3, 1, 10); \ + } + +#define EXECUTE_TEST_MV_MIXED_LAYOUT(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##spmv_mv_mixed_layout##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + test_spmv_mv_heavy(99, 101, 100 * 15, 40, 4, \ + 20); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ @@ -1268,8 +1291,20 @@ EXECUTE_TEST_ISSUE_101(TestDevice) #include #undef KOKKOSKERNELS_EXECUTE_TEST +#endif -#endif // defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +// Test that requires mixing LayoutLeft and LayoutRight (never an ETI'd +// combination) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV_MIXED_LAYOUT(SCALAR, ORDINAL, OFFSET, TestDevice) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST +#endif #undef EXECUTE_TEST_FN #undef EXECUTE_TEST_STRUCT From 44b4418db543ac47f3119cae59e1e3a935ce8556 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 26 Mar 2024 09:40:15 -0600 Subject: [PATCH 202/326] Applying clang-format --- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 02626bab44..f0b78408bc 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1547,8 +1547,8 @@ class PointGaussSeidel { Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t>( + KokkosKernels::Impl::zero_vector( my_exec_space, num_cols * block_size, Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< @@ -1664,8 +1664,8 @@ class PointGaussSeidel { Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t>( + KokkosKernels::Impl::zero_vector( my_exec_space, num_cols, Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< From 8756faaa64fc5192e24634386a1051e1e2730c83 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 26 Mar 2024 13:36:56 -0600 Subject: [PATCH 203/326] Use default_size_type as default offset in matrix types (#2149) Now a declaration like CrsMatrix will by default use an ETI'd type combination (as int is the default ETI'd offset) --- sparse/src/KokkosSparse_BsrMatrix.hpp | 5 ++--- sparse/src/KokkosSparse_CrsMatrix.hpp | 4 +--- sparse/src/KokkosSparse_ccs2crs.hpp | 8 ++++++++ sparse/src/KokkosSparse_crs2ccs.hpp | 10 +++++++++- sparse/src/KokkosSparse_crs2coo.hpp | 10 +++++++++- sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp | 8 +++++--- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 4 ++-- test_common/KokkosKernels_TestUtils.hpp | 6 ++---- 8 files changed, 38 insertions(+), 17 deletions(-) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index db9ef71753..06a9ad92cf 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -34,6 +34,7 @@ #include "Kokkos_ArithTraits.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_Error.hpp" +#include "KokkosKernels_default_types.hpp" namespace KokkosSparse { @@ -325,9 +326,7 @@ struct BsrRowViewConst { /// storage for sparse matrices, as described, for example, in Saad /// (2nd ed.). template ::size_type> + class MemoryTraits = void, class SizeType = default_size_type> class BsrMatrix { static_assert( std::is_signed::value, diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index ce9ec99e4e..2e69683441 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -339,9 +339,7 @@ struct SparseRowViewConst { /// storage for sparse matrices, as described, for example, in Saad /// (2nd ed.). template ::size_type> + class MemoryTraits = void, class SizeType = default_size_type> class CrsMatrix { static_assert( std::is_signed::value, diff --git a/sparse/src/KokkosSparse_ccs2crs.hpp b/sparse/src/KokkosSparse_ccs2crs.hpp index 9b4bae2134..fcdf45c9c8 100644 --- a/sparse/src/KokkosSparse_ccs2crs.hpp +++ b/sparse/src/KokkosSparse_ccs2crs.hpp @@ -102,6 +102,14 @@ template auto ccs2crs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, ColMapViewType col_map, RowIdViewType row_ids) { + static_assert( + std::is_same_v, + "ccs2crs: SizeType (type of nnz) must match the element type of " + "ColMapViewType"); + static_assert( + std::is_same_v, + "ccs2crs: OrdinalType (type of nrows, ncols) must match the element type " + "of RowIdViewType"); using Ccs2crsType = Impl::Ccs2Crs; Ccs2crsType ccs2Crs(nrows, ncols, nnz, vals, col_map, row_ids); diff --git a/sparse/src/KokkosSparse_crs2ccs.hpp b/sparse/src/KokkosSparse_crs2ccs.hpp index c9265842cb..4b985b5b6d 100644 --- a/sparse/src/KokkosSparse_crs2ccs.hpp +++ b/sparse/src/KokkosSparse_crs2ccs.hpp @@ -100,6 +100,14 @@ template auto crs2ccs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, RowMapViewType row_map, ColIdViewType col_ids) { + static_assert( + std::is_same_v, + "crs2ccs: SizeType (type of nnz) must match the element type of " + "RowMapViewType"); + static_assert( + std::is_same_v, + "crs2ccs: OrdinalType (type of nrows, ncols) must match the element type " + "of ColIdViewType"); using Crs2ccsType = Impl::Crs2Ccs; Crs2ccsType crs2Ccs(nrows, ncols, nnz, vals, row_map, col_ids); @@ -128,4 +136,4 @@ auto crs2ccs(KokkosSparse::CrsMatrix auto crs2coo(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, RowMapViewType row_map, ColIdViewType col_ids) { + static_assert( + std::is_same_v, + "crs2coo: SizeType (type of nnz) must match the element type of " + "RowMapViewType"); + static_assert( + std::is_same_v, + "crs2coo: OrdinalType (type of nrows, ncols) must match the element type " + "of ColIdViewType"); using Crs2cooType = Impl::Crs2Coo; Crs2cooType crs2Coo(nrows, ncols, nnz, vals, row_map, col_ids); @@ -161,4 +169,4 @@ auto crs2coo(KokkosSparse::CrsMatrix void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + using RandCs = RandCsMatrix; + using size_type = typename RandCs::size_type; auto expected_min = ScalarType(1.0); size_t expected_nnz = 0; - RandCsMatrix cm(m, n, min_val, max_val); + RandCs cm(m, n, min_val, max_val); - for (size_t i = 0; i < cm.get_nnz(); ++i) + for (size_type i = 0; i < cm.get_nnz(); ++i) ASSERT_GE(cm(i), expected_min) << cm.info; auto map_d = cm.get_map(); @@ -83,4 +85,4 @@ TEST_F(TestCategory, sparse_randcsmat) { doAllCsMat(dim, dim * 3); } } -} // namespace Test \ No newline at end of file +} // namespace Test diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index 6482d33d8a..cb42f5c2e4 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -134,9 +134,9 @@ Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { rcs(blockRows, blockCols, scalar_type(0), max_a()); const auto colids = Kokkos::subview( - rcs.get_ids(), Kokkos::make_pair(size_t(0), rcs.get_nnz())); + rcs.get_ids(), Kokkos::make_pair(size_type(0), rcs.get_nnz())); const auto vals = Kokkos::subview( - rcs.get_vals(), Kokkos::make_pair(size_t(0), rcs.get_nnz())); + rcs.get_vals(), Kokkos::make_pair(size_type(0), rcs.get_nnz())); Graph graph(colids, rcs.get_map()); Crs crs("crs", blockCols, vals, graph); diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 232b66242a..50a28cb1e8 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -643,9 +643,7 @@ class RandCooMat { /// \tparam LayoutType /// \tparam Device template ::size_type> + typename Ordinal = int64_t, typename Size = default_size_type> class RandCsMatrix { public: using value_type = ScalarType; @@ -765,7 +763,7 @@ class RandCsMatrix { // O(c), where c is a constant. ScalarType operator()(Size idx) { return __vals(idx); } - size_t get_nnz() { return size_t(__nnz); } + Size get_nnz() { return __nnz; } // dimension2: This is either columns for a Crs matrix or rows for a Ccs // matrix. Ordinal get_dim2() { return __dim2; } From fb8eff56012e6931ca2f33151accb94c092e9e6f Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 26 Mar 2024 16:46:14 -0600 Subject: [PATCH 204/326] spmv bsr perftest: move fences to right place (#2153) --- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp index d96a3c6c8d..bd0afc9a1b 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -183,8 +183,8 @@ int test_bsr_matrix_single_vec( Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); - time_crs += timer.seconds(); Kokkos::fence(); + time_crs += timer.seconds(); } // Create the output vector @@ -212,8 +212,8 @@ int test_bsr_matrix_single_vec( Kokkos::deep_copy(ybsr, h_ybsr); Kokkos::Timer timer; KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); - time_bsr += timer.seconds(); Kokkos::fence(); + time_bsr += timer.seconds(); } // Check that the numerical result is matching @@ -350,8 +350,8 @@ int test_bsr_matrix_vec( Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); - time_crs += timer.seconds(); Kokkos::fence(); + time_crs += timer.seconds(); } // Create the BsrMatrix variable @@ -380,8 +380,8 @@ int test_bsr_matrix_vec( Kokkos::deep_copy(ybsr, h_ybsr); Kokkos::Timer timer; KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); - time_bsr += timer.seconds(); Kokkos::fence(); + time_bsr += timer.seconds(); } // Check that the result is matching From aadab525f75169b3ca2da8b5b6982f72b4976364 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 27 Mar 2024 09:39:25 -0600 Subject: [PATCH 205/326] Kokkos Kernels: removing old code branches ahead of 4.3.00 release (#2139) Some old code branches kept for compatibility with Kokkos Core versions less than 4.2.00 are dropped after release 4.3.00. The largest changes are the removal of support the Kokkos print macro in favor of Kokkos::printf() and the removal of half support from Kokkos Kernels since it is now in Kokkos Core. --- .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 42 --- .../dense/impl/KokkosBatched_Copy_Impl.hpp | 50 --- .../dense/impl/KokkosBatched_Dot_Internal.hpp | 93 ------ .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 75 ----- .../KokkosBatched_HadamardProduct_Impl.hpp | 48 --- .../dense/impl/KokkosBatched_Xpay_Impl.hpp | 42 --- .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 78 ----- .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 78 ----- .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 78 ----- .../sparse/src/KokkosBatched_JacobiPrec.hpp | 14 - blas/src/KokkosBlas1_nrm2.hpp | 7 - blas/unit_test/Test_Blas2_ger.hpp | 122 ------- blas/unit_test/Test_Blas2_syr.hpp | 98 ------ common/src/KokkosKernels_Error.hpp | 11 - common/src/KokkosKernels_Half.hpp | 65 ---- common/src/KokkosKernels_SimpleUtils.hpp | 8 - common/src/Kokkos_ArithTraits.hpp | 288 ----------------- common/unit_test/Test_Common_ArithTraits.hpp | 298 ------------------ common/unit_test/Test_Common_LowerBound.hpp | 12 - common/unit_test/Test_Common_UpperBound.hpp | 12 - ode/impl/KokkosODE_Newton_impl.hpp | 5 - sparse/src/KokkosSparse_spmv_team.hpp | 28 -- sparse/unit_test/Test_Sparse_spmv.hpp | 13 - test_common/KokkosKernels_TestUtils.hpp | 20 -- 24 files changed, 1585 deletions(-) delete mode 100644 common/src/KokkosKernels_Half.hpp diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index 400c46544d..da9d607241 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -199,31 +199,17 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " - "%d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } #endif @@ -263,31 +249,17 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " - "%d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } #endif @@ -332,31 +304,17 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " - "%d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index 5b693bb87a..0a8c9d456f 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -59,17 +59,10 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " - "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#else Kokkos::printf( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#endif return 1; } #endif @@ -94,17 +87,10 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " - "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#else Kokkos::printf( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#endif return 1; } #endif @@ -157,21 +143,12 @@ struct TeamCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " - "%d, " - "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); -#else Kokkos::printf( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#endif return 1; } #endif @@ -204,21 +181,12 @@ struct TeamCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " - "%d, " - "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); -#else Kokkos::printf( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#endif return 1; } #endif @@ -277,21 +245,12 @@ struct TeamVectorCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " - "%d, " - "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); -#else Kokkos::printf( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#endif return 1; } #endif @@ -324,21 +283,12 @@ struct TeamVectorCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " - "%d, " - "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); -#else Kokkos::printf( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); -#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index 854069289e..a0960c621b 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -186,35 +186,19 @@ struct SerialDot { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(1) != dot.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Second dimension of X and alpha do not match: " - "X: " - "%d x %d, dot: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#else Kokkos::printf( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#endif return 1; } #endif @@ -248,33 +232,18 @@ struct SerialDot { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != dot.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: First dimension of X and alpha do not match: X: " - "%d x %d, dot: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#else Kokkos::printf( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#endif return 1; } #endif @@ -313,35 +282,19 @@ struct TeamDot { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(1) != dot.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Second dimension of X and alpha do not match: " - "X: " - "%d x %d, dot: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#else Kokkos::printf( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#endif return 1; } #endif @@ -384,33 +337,18 @@ struct TeamDot { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != dot.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: First dimension of X and alpha do not match: X: " - "%d x %d, dot: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#else Kokkos::printf( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#endif return 1; } #endif @@ -457,35 +395,19 @@ struct TeamVectorDot { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(1) != dot.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Second dimension of X and alpha do not match: " - "X: " - "%d x %d, dot: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#else Kokkos::printf( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#endif return 1; } #endif @@ -528,33 +450,18 @@ struct TeamVectorDot { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != dot.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::dot: First dimension of X and alpha do not match: X: " - "%d x %d, dot: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#else Kokkos::printf( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); -#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 86d0d0873e..4c9f54d037 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -388,39 +388,22 @@ struct SerialGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " - "%d x %d, tmp (note: its second dimension should be the second " - "dimension of A + 4): %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), - (int)tmp.extent(1)); -#else Kokkos::printf( "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " "%d x %d, tmp (note: its second dimension should be the second " "dimension of A + 4): %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), (int)tmp.extent(1)); -#endif return 1; } if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); -#else Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); -#endif return 1; } #endif @@ -435,15 +418,9 @@ struct SerialGesv { if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: the currently implemented static pivoting " - "failed.\n"); -#else Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); -#endif return 1; } @@ -489,19 +466,11 @@ struct SerialGesv { if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); -#else Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); -#endif return 1; } #endif @@ -548,19 +517,11 @@ struct TeamGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); -#else Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); -#endif return 1; } #endif @@ -579,15 +540,9 @@ struct TeamGesv { if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: the currently implemented static pivoting " - "failed.\n"); -#else Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); -#endif return 1; } member.team_barrier(); @@ -640,19 +595,11 @@ struct TeamGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); -#else Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); -#endif return 1; } #endif @@ -706,19 +653,11 @@ struct TeamVectorGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); -#else Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); -#endif return 1; } #endif @@ -737,15 +676,9 @@ struct TeamVectorGesv { if (TeamVectorStaticPivoting::invoke( member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: the currently implemented static pivoting " - "failed.\n"); -#else Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); -#endif return 1; } @@ -799,19 +732,11 @@ struct TeamVectorGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); -#else Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); -#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index 0570bc4ccc..90b89e4ad1 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -110,35 +110,19 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " - "X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " - "X: %d x %d, " - "V: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); -#else Kokkos::printf( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); -#endif return 1; } #endif @@ -177,35 +161,19 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " - "X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " - "X: %d x %d, " - "V: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); -#else Kokkos::printf( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); -#endif return 1; } #endif @@ -246,35 +214,19 @@ KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " - "X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " - "X: %d x %d, " - "V: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); -#else Kokkos::printf( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); -#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index 5e5b7e13cc..52e1425041 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -204,31 +204,17 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " - "%d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } #endif @@ -261,31 +247,17 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " - "%d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } #endif @@ -319,31 +291,17 @@ KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " - "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " - "%d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index b96dc79a80..0f1e5feb39 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -153,95 +153,49 @@ struct SerialSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " - "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and alpha do not match: " - "X: %d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } if (X.extent(0) != beta.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and beta do not match: X: " - "%d x %d, beta: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); -#endif return 1; } if (X.extent(0) != values.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and the first dimension " - "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of colIndices and the second " - "dimension of values do not match: colIndices: %d , values: %d x " - "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " - "of X do not match: colIndices (-1): %d , values: %d x %d\n", - (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#endif return 1; } #endif @@ -289,67 +243,35 @@ struct SerialSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " - "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != values.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and the first dimension " - "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of colIndices and the second " - "dimension of values do not match: colIndices: %d , values: %d x " - "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " - "of X do not match: colIndices (-1): %d , values: %d x %d\n", - (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index d7379777be..dd510b2d0e 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -341,95 +341,49 @@ struct TeamVectorSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " - "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and alpha do not match: " - "X: %d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } if (X.extent(0) != beta.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and beta do not match: X: " - "%d x %d, beta: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); -#endif return 1; } if (X.extent(0) != values.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and the first dimension " - "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of colIndices and the second " - "dimension of values do not match: colIndices: %d , values: %d x " - "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " - "of X do not match: colIndices (-1): %d , values: %d x %d\n", - (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#endif return 1; } #endif @@ -484,67 +438,35 @@ struct TeamVectorSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " - "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != values.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and the first dimension " - "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of colIndices and the second " - "dimension of values do not match: colIndices: %d , values: %d x " - "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " - "of X do not match: colIndices (-1): %d , values: %d x %d\n", - (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index beb53521f0..41128744a3 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -192,95 +192,49 @@ struct TeamSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " - "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != alpha.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and alpha do not match: " - "X: %d x %d, alpha: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); -#endif return 1; } if (X.extent(0) != beta.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and beta do not match: X: " - "%d x %d, beta: %d\n", - (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); -#endif return 1; } if (X.extent(0) != values.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and the first dimension " - "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of colIndices and the second " - "dimension of values do not match: colIndices: %d , values: %d x " - "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " - "of X do not match: colIndices (-1): %d , values: %d x %d\n", - (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#endif return 1; } #endif @@ -335,67 +289,35 @@ struct TeamSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " - "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); -#endif return 1; } if (X.extent(0) != values.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: First dimension of X and the first dimension " - "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of colIndices and the second " - "dimension of values do not match: colIndices: %d , values: %d x " - "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); -#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " - "of X do not match: colIndices (-1): %d , values: %d x %d\n", - (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#else Kokkos::printf( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); -#endif return 1; } #endif diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 44a982525d..eacb859636 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -109,17 +109,10 @@ class JacobiPrec { } if (tooSmall > 0) -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " - "magnitude and have been replaced by one, \n", - (int)tooSmall); -#else Kokkos::printf( "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " "magnitude and have been replaced by one, \n", (int)tooSmall); -#endif computed_inverse = true; } @@ -138,17 +131,10 @@ class JacobiPrec { } if (tooSmall > 0) -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " - "magnitude and have been replaced by one, \n", - (int)tooSmall); -#else Kokkos::printf( "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " "magnitude and have been replaced by one, \n", (int)tooSmall); -#endif computed_inverse = true; } diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index 64643367a0..59f105f5a4 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -241,17 +241,10 @@ KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { " Kokkos::ArithTraits::mag_type"); if (R.extent(0) != X.extent(1)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," - " R: %d and X: %d x %d.\n", - R.extent_int(0), X.extent_int(0), X.extent_int(1)); -#else Kokkos::printf( "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," " R: %d and X: %d x %d.\n", R.extent_int(0), X.extent_int(0), X.extent_int(1)); -#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index df3d2cb5d1..9a8f740569 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -295,15 +295,9 @@ void GerTester h_vanilla( "vanilla = A + alpha * x * y^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", - typeid(alpha).name()); -#else Kokkos::printf( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); -#endif #endif this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); @@ -1364,17 +1358,10 @@ void GerTester& A, const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): " - "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", - typeid(_ViewTypeA).name(), _kkGerShouldThrowException); -#else Kokkos::printf( "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): " "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException); -#endif #endif std::string mode = _useHermitianOption ? "H" : "T"; bool gotStdException(false); @@ -1416,22 +1403,11 @@ void GerTester #ifdef HAVE_KOKKOSKERNELS_DEBUG int test_ger(const std::string& caseName) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "+=======================================================================" - "===\n"); -#else Kokkos::printf( "+=======================================================================" "===\n"); -#endif -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", - caseName.c_str(), typeid(Device).name()); -#else Kokkos::printf("Starting %s, device = %s ...\n", caseName.c_str(), typeid(Device).name()); -#endif #else int test_ger(const std::string& /*caseName*/) { #endif @@ -1453,21 +1429,10 @@ int test_ger(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "+-----------------------------------------------------------------------" - "---\n"); -#else Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); -#endif -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", - caseName.c_str()); -#else Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); -#endif #endif if (true) { Test::GerTester::test( view_stride_adapter<_ViewTypeExpected, true> h_vanilla( "vanilla = A + alpha * x * x^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", - typeid(alpha).name()); -#else Kokkos::printf( "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); -#endif #endif this->populateVanillaValues(alpha, x.h_view, A.h_view, h_vanilla.d_view); @@ -1441,17 +1435,10 @@ void SyrTester:: #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " - "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", - typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); -#else Kokkos::printf( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); -#endif #endif std::string mode = _useHermitianOption ? "H" : "T"; std::string uplo = _useUpOption ? "U" : "L"; @@ -1505,17 +1492,10 @@ void SyrTester:: #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " - "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", - typeid(_ViewTypeA).name(), _kkGerShouldThrowException); -#else Kokkos::printf( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException); -#endif #endif std::string mode = _useHermitianOption ? "H" : "T"; bool gotStdException(false); @@ -1582,17 +1562,10 @@ void SyrTester:: template #ifdef HAVE_KOKKOSKERNELS_DEBUG int test_syr(const std::string& caseName) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "+=======================================================================" - "===\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); -#else Kokkos::printf( "+=======================================================================" "===\n"); Kokkos::printf("Starting %s ...\n", caseName.c_str()); -#endif #else int test_syr(const std::string& /*caseName*/) { #endif @@ -1610,18 +1583,10 @@ int test_syr(const std::string& /*caseName*/) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "+-----------------------------------------------------------------------" - "---\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", - caseName.c_str()); -#else Kokkos::printf( "+-----------------------------------------------------------------------" "---\n"); Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); -#endif #endif if (true) { Test::SyrTester mag_type(eps)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " - "(eps = %e)\n", - (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), - KAT::imag(view2(i)), eps); -#else Kokkos::printf( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " "(eps = %e)\n", (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), KAT::imag(view2(i)), eps); -#endif num_diffs++; } } diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 75c0951e10..415189be93 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -25,9 +25,6 @@ #include #include #include -#if KOKKOS_VERSION < 40199 -#include -#endif #include @@ -1003,109 +1000,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = true; -#if KOKKOS_VERSION < 40199 - static KOKKOS_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_half( - Kokkos::Experimental::infinity::value); - } - - static KOKKOS_FUNCTION bool isInf(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isinf; -#endif - return isinf(Kokkos::Experimental::cast_from_half(x)); - } - static KOKKOS_FUNCTION bool isNan(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isnan; -#endif - return isnan(Kokkos::Experimental::cast_from_half(x)); - } - static KOKKOS_FUNCTION mag_type abs(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::abs(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type zero() { - return Kokkos::Experimental::cast_to_half(0.0); - } - static KOKKOS_FUNCTION val_type one() { - return Kokkos::Experimental::cast_to_half(1.0); - } - static KOKKOS_FUNCTION val_type min() { - return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); - } - static KOKKOS_FUNCTION val_type max() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); - } - static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } - static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } - static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { - return Kokkos::Experimental::cast_to_half( - Kokkos::pow(Kokkos::Experimental::cast_from_half(x), - Kokkos::Experimental::cast_from_half(y))); - } - static KOKKOS_FUNCTION val_type sqrt(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::sqrt(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type cbrt(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::cbrt(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type exp(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::exp(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type log(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::log(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type log10(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::log10(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type sin(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::sin(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type cos(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::cos(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type tan(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::tan(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type sinh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::sinh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type cosh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::cosh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type tanh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::tanh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type asin(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::asin(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type acos(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::acos(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION val_type atan(const val_type x) { - return Kokkos::Experimental::cast_to_half( - Kokkos::atan(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FUNCTION mag_type epsilon() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); - } -#endif - // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; using halfPrecision = Kokkos::Experimental::half_t; @@ -1118,52 +1012,11 @@ class ArithTraits { static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = true; -#if KOKKOS_VERSION < 40199 - static KOKKOS_FUNCTION bool isnaninf(const val_type x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FUNCTION val_type nan() { - return Kokkos::Experimental::cast_to_half( - Kokkos::Experimental::quiet_NaN::value); - } - static KOKKOS_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); - } - static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_FP16_RADIX; } - // Use float to allow running on both host and device - static KOKKOS_FUNCTION float prec() { - float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; - float b = (float)base(); - float r = e * b; - return r; - } - static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; } - static KOKKOS_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; } - static KOKKOS_FUNCTION mag_type rmin() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); - } - static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; } - static KOKKOS_FUNCTION mag_type rmax() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); - } -#else #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) KOKKOSKERNELS_ARITHTRAITS_HALF_FP(KOKKOS_FUNCTION) #else KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) #endif -#endif }; #endif // #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT @@ -1183,106 +1036,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = true; -#if KOKKOS_VERSION < 40199 - static KOKKOS_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::Experimental::infinity::value); - } - - static KOKKOS_FUNCTION bool isInf(const val_type x) { - return Kokkos::isinf(Kokkos::Experimental::cast_from_bhalf(x)); - } - static KOKKOS_FUNCTION bool isNan(const val_type x) { - return Kokkos::isnan(Kokkos::Experimental::cast_from_bhalf(x)); - } - static KOKKOS_FUNCTION mag_type abs(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::abs(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type zero() { - return Kokkos::Experimental::cast_to_bhalf(0.0F); - } - static KOKKOS_FUNCTION val_type one() { - return Kokkos::Experimental::cast_to_bhalf(1.0F); - } - static KOKKOS_FUNCTION val_type min() { - return Kokkos::Experimental::cast_to_bhalf(-KOKKOSKERNELS_IMPL_BF16_MAX); - } - static KOKKOS_FUNCTION val_type max() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); - } - static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FUNCTION mag_type imag(const val_type) { - return Kokkos::Experimental::cast_to_bhalf(0.0F); - } - static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } - static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::pow(Kokkos::Experimental::cast_from_bhalf(x), - Kokkos::Experimental::cast_from_bhalf(y))); - } - static KOKKOS_FUNCTION val_type sqrt(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::sqrt(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type cbrt(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::cbrt(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type exp(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::exp(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type log(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::log(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type log10(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::log10(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type sin(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::sin(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type cos(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::cos(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type tan(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::tan(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type sinh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::sinh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type cosh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::cosh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type tanh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::tanh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type asin(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::asin(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type acos(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::acos(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION val_type atan(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::atan(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FUNCTION mag_type epsilon() { - // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); - } -#endif - // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; using bhalfPrecision = Kokkos::Experimental::bhalf_t; @@ -1297,52 +1050,11 @@ class ArithTraits { static std::string name() { return "bhalf_t"; } -#if KOKKOS_VERSION < 40199 - static KOKKOS_FUNCTION bool isnaninf(const val_type x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FUNCTION val_type nan() { - return Kokkos::Experimental::cast_to_bhalf( - Kokkos::Experimental::quiet_NaN::value); - } - static KOKKOS_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); - } - static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_BF16_RADIX; } - // Use float to allow running on both host and device - static KOKKOS_FUNCTION float prec() { - float e = KOKKOSKERNELS_IMPL_BF16_EPSILON; - float b = (float)base(); - float r = e * b; - return r; - } - static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; } - static KOKKOS_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; } - static KOKKOS_FUNCTION mag_type rmin() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); - } - static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; } - static KOKKOS_FUNCTION mag_type rmax() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); - } -#else #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) KOKKOSKERNELS_ARITHTRAITS_HALF_FP(KOKKOS_FUNCTION) #else KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) #endif -#endif }; #endif // #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 1d9a4c6480..8c493a3666 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -35,30 +35,15 @@ #include // typeid (T) #include -#if KOKKOS_VERSION < 40199 -#define FAILURE() \ - { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%s:%d: Failure\n", __FILE__, __func__, \ - __LINE__); \ - success = 0; \ - } -#else #define FAILURE() \ { \ Kokkos::printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); \ success = 0; \ } -#endif #if 0 -#if KOKKOS_VERSION < 40199 -#define TRACE() \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%s:%d: Trace\n", __FILE__, __func__, \ - __LINE__); -#else #define TRACE() \ Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); -#endif #else #define TRACE() #endif @@ -194,11 +179,7 @@ class ArithTraitsTesterBase { // T, but we check for this int constant for compatibility with // std::numeric_limits. if (!AT::is_specialized) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("! AT::is_specialized\n"); -#else Kokkos::printf("! AT::is_specialized\n"); -#endif FAILURE(); } @@ -206,21 +187,11 @@ class ArithTraitsTesterBase { // function, just not to its class methods (which are not marked // as device functions). if (AT::is_integer != std::numeric_limits::is_integer) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT::is_integer not same as numeric_limits\n"); -#else Kokkos::printf("AT::is_integer not same as numeric_limits\n"); -#endif FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT::is_exact not same as numeric_limits\n"); -#else Kokkos::printf("AT::is_exact not same as numeric_limits\n"); -#endif FAILURE(); } @@ -229,62 +200,34 @@ class ArithTraitsTesterBase { // Test properties of the arithmetic and multiplicative identities. if (zero + zero != zero) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 + 0 != 0\n"); -#else Kokkos::printf("0 + 0 != 0\n"); -#endif FAILURE(); } if (zero + one != one) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 + 1 != 1\n"); -#else Kokkos::printf("0 + 1 != 1\n"); -#endif FAILURE(); } if (one - one != zero) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 - 1 != 0\n"); -#else Kokkos::printf("1 - 1 != 0\n"); -#endif FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). if ((one + one) - one != one) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("(1 + 1) - 1 != 1\n"); -#else Kokkos::printf("(1 + 1) - 1 != 1\n"); -#endif FAILURE(); } if (AT::abs(zero) != zero) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(0) != 0\n"); -#else Kokkos::printf("AT::abs(0) != 0\n"); -#endif FAILURE(); } if (AT::abs(one) != one) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(1) != 1\n"); -#else Kokkos::printf("AT::abs(1) != 1\n"); -#endif FAILURE(); } if (AT::is_signed && AT::abs(-one) != one) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::is_signed and AT::abs(-1) != 1\n"); -#else Kokkos::printf("AT::is_signed and AT::abs(-1) != 1\n"); -#endif FAILURE(); } // Need enable_if to test whether T can be compared using <=. @@ -293,11 +236,7 @@ class ArithTraitsTesterBase { // These are very mild ordering properties. // They should work even for a set only containing zero. if (AT::abs(zero) > AT::abs(AT::max())) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(0) > AT::abs (AT::max ())\n"); -#else Kokkos::printf("AT::abs(0) > AT::abs (AT::max ())\n"); -#endif FAILURE(); } @@ -621,36 +560,20 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(two, three); if (!equal(result, eight)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(2,3) != 8\n"); -#else Kokkos::printf("AT::pow(2,3) != 8\n"); -#endif FAILURE(); } } if (!equal(AT::pow(three, zero), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,0) != 1\n"); -#else Kokkos::printf("AT::pow(3,0) != 1\n"); -#endif FAILURE(); } if (!equal(AT::pow(three, one), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,1) != 3\n"); -#else Kokkos::printf("AT::pow(3,1) != 3\n"); -#endif FAILURE(); } if (!equal(AT::pow(three, two), nine)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,2) != 9\n"); -#else Kokkos::printf("AT::pow(3,2) != 9\n"); -#endif FAILURE(); } @@ -658,11 +581,7 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(three, three); if (!equal(result, twentySeven)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,3) != 27\n"); -#else Kokkos::printf("AT::pow(3,3) != 27\n"); -#endif FAILURE(); } } @@ -671,170 +590,93 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_signed && !AT::is_complex) { result = AT::pow(-three, one); if (!equal(result, -three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,1) != -3\n"); -#else Kokkos::printf("AT::pow(-3,1) != -3\n"); -#endif FAILURE(); } result = AT::pow(-three, two); if (!equal(result, nine)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,2) != 9\n"); -#else Kokkos::printf("AT::pow(-3,2) != 9\n"); -#endif FAILURE(); } result = AT::pow(-three, three); if (!equal(result, -twentySeven)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,3) != 27\n"); -#else Kokkos::printf("AT::pow(-3,3) != 27\n"); -#endif FAILURE(); } } if (!equal(AT::sqrt(zero), zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(0) != 0\n"); -#else Kokkos::printf("AT::sqrt(0) != 0\n"); -#endif FAILURE(); } if (!equal(AT::sqrt(one), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(1) != 1\n"); -#else Kokkos::printf("AT::sqrt(1) != 1\n"); -#endif FAILURE(); } if (!equal(AT::sqrt(thirtySix), six)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(36) != 6\n"); -#else Kokkos::printf("AT::sqrt(36) != 6\n"); -#endif FAILURE(); } if (!equal(AT::sqrt(sixtyFour), eight)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(64) != 8\n"); -#else Kokkos::printf("AT::sqrt(64) != 8\n"); -#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::sqrt(fortyTwo), six)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:sqrt(42) != 6\n"); -#else Kokkos::printf("AT:sqrt(42) != 6\n"); -#endif FAILURE(); } if (!equal(AT::sqrt(oneTwentySeven), eleven)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(127) != 11\n"); -#else Kokkos::printf("AT::sqrt(127) != 11\n"); -#endif FAILURE(); } } if (!equal(AT::cbrt(zero), zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 0\n"); -#else Kokkos::printf("AT::cbrt(0) != 0\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(one), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(1) != 1\n"); -#else Kokkos::printf("AT::cbrt(1) != 1\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(twentySeven), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(27) != 3\n"); -#else Kokkos::printf("AT::cbrt(27) != 3\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(sixtyFour), four)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(64) != 4\n"); -#else Kokkos::printf("AT::cbrt(64) != 4\n"); -#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt(fortyTwo), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:cbrt(42) != 3\n"); -#else Kokkos::printf("AT:cbrt(42) != 3\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(oneTwentySeven), five)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(127) != 5\n"); -#else Kokkos::printf("AT::cbrt(127) != 5\n"); -#endif FAILURE(); } } if (!equal(AT::exp(zero), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 1\n"); -#else Kokkos::printf("AT::cbrt(0) != 1\n"); -#endif FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); -#else Kokkos::printf( "AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); -#endif FAILURE(); } } if (!equal(AT::log(one), zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::log(1) != 0\n"); -#else Kokkos::printf("AT::log(1) != 0\n"); -#endif FAILURE(); } if (!equal(AT::log10(one), zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::log10(1) != 0\n"); -#else Kokkos::printf("AT::log10(1) != 0\n"); -#endif FAILURE(); } @@ -843,23 +685,13 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); -#else Kokkos::printf( "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); -#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#else Kokkos::printf( "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#endif FAILURE(); } } else { @@ -867,47 +699,25 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); -#else Kokkos::printf("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); -#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#else Kokkos::printf("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#endif FAILURE(); } } if (!equal(AT::asin(AT::sin(one)), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::asin(sin(1)) != 1\n"); -#else Kokkos::printf("AT::asin(sin(1)) != 1\n"); -#endif FAILURE(); } if (!equal(AT::acos(AT::cos(one)), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::acos(cos(1)) != 1\n"); -#else Kokkos::printf("AT::acos(cos(1)) != 1\n"); -#endif FAILURE(); } if (!equal(AT::atan(AT::tan(one)), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::atan(tan(1)) != 1\n"); -#else Kokkos::printf("AT::atan(tan(1)) != 1\n"); -#endif FAILURE(); } @@ -1034,74 +844,41 @@ class ArithTraitsTesterTranscendentalBase } if (!equal(AT::cbrt(zero), zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 0\n"); -#else Kokkos::printf("AT::cbrt(0) != 0\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(one), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(1) != 1\n"); -#else Kokkos::printf("AT::cbrt(1) != 1\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(twentySeven), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(27) != 3\n"); -#else Kokkos::printf("AT::cbrt(27) != 3\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(sixtyFour), four)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(64) != 4\n"); -#else Kokkos::printf("AT::cbrt(64) != 4\n"); -#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt(fortyTwo), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:cbrt(42) != 3\n"); -#else Kokkos::printf("AT:cbrt(42) != 3\n"); -#endif FAILURE(); } if (!equal(AT::cbrt(oneTwentySeven), five)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(127) != 5\n"); -#else Kokkos::printf("AT::cbrt(127) != 5\n"); -#endif FAILURE(); } } if (!equal(AT::exp(zero), one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 1\n"); -#else Kokkos::printf("AT::cbrt(0) != 1\n"); -#endif FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); -#else Kokkos::printf( "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); -#endif FAILURE(); } } @@ -1119,23 +896,13 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); -#else Kokkos::printf( "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); -#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#else Kokkos::printf( "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#endif FAILURE(); } } else { @@ -1143,47 +910,25 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); -#else Kokkos::printf("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); -#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#else Kokkos::printf("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); -#endif FAILURE(); } } if (!equal(AT::asin(AT::sin(three)), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::asin(sin(3)) != 3\n"); -#else Kokkos::printf("AT::asin(sin(3)) != 3\n"); -#endif FAILURE(); } if (!equal(AT::acos(AT::cos(three)), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::acos(cos(3)) != 3\n"); -#else Kokkos::printf("AT::acos(cos(3)) != 3\n"); -#endif FAILURE(); } if (!equal(AT::atan(AT::tan(three)), three)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::atan(tan(3)) != 3\n"); -#else Kokkos::printf("AT::atan(tan(3)) != 3\n"); -#endif FAILURE(); } @@ -1275,17 +1020,10 @@ class ArithTraitsTesterComplexBase #else { if (AT::is_signed != std::numeric_limits::is_signed) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "AT::is_signed = 0x%x, std::numeric_limits::is_signed " - "= 0x%x\n", - AT::is_signed, std::numeric_limits::is_signed); -#else Kokkos::printf( "AT::is_signed = 0x%x, std::numeric_limits::is_signed " "= 0x%x\n", AT::is_signed, std::numeric_limits::is_signed); -#endif FAILURE(); } } @@ -1498,11 +1236,7 @@ class ArithTraitsTesterFloatingPointBase int success = 1; if (AT::is_exact) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::is_exact is 1\n"); -#else Kokkos::printf("AT::is_exact is 1\n"); -#endif FAILURE(); } @@ -1517,11 +1251,7 @@ class ArithTraitsTesterFloatingPointBase { #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP if (!AT::isNan(AT::nan())) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("NaN is not NaN\n"); -#else Kokkos::printf("NaN is not NaN\n"); -#endif FAILURE(); } } @@ -1530,56 +1260,32 @@ class ArithTraitsTesterFloatingPointBase const ScalarType one = AT::one(); if (AT::isInf(zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is Inf\n"); -#else Kokkos::printf("0 is Inf\n"); -#endif FAILURE(); } if (AT::isInf(one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is Inf\n"); -#else Kokkos::printf("1 is Inf\n"); -#endif FAILURE(); } #if defined(KOKKOS_ENABLE_SYCL) || \ defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP if constexpr (!std::is_same_v) { if (AT::isNan(zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); -#else Kokkos::printf("0 is NaN\n"); -#endif FAILURE(); } if (AT::isNan(one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); -#else Kokkos::printf("1 is NaN\n"); -#endif FAILURE(); } } #else if (AT::isNan(zero)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); -#else Kokkos::printf("0 is NaN\n"); -#endif FAILURE(); } if (AT::isNan(one)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); -#else Kokkos::printf("1 is NaN\n"); -#endif FAILURE(); } #endif @@ -1696,11 +1402,7 @@ class ArithTraitsTesterFloatingPointBase int success = 1; if (!AT::is_exact) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("! AT:is_exact\n"); -#else Kokkos::printf("! AT:is_exact\n"); -#endif FAILURE(); } diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index 6ca28b8be1..23574087ff 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -43,14 +43,8 @@ struct ThreadLowerBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::lower_bound_thread(haystack_, needle_); if (idx != expected_) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(i), - int(expected_), int(idx)); -#else Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); -#endif ++lerrCount; } } @@ -105,14 +99,8 @@ struct TeamLowerBoundFunctor { hv_size_type idx = KokkosKernels::lower_bound_team(handle, haystack_, needle_); if (idx != expected_) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(handle.team_rank()), - int(expected_), int(idx)); -#else Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); -#endif ++lerrCount; } } diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index 113b76c3ad..aace02a738 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -43,14 +43,8 @@ struct ThreadUpperBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); if (idx != expected_) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(i), - int(expected_), int(idx)); -#else Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); -#endif ++lerrCount; } } @@ -105,14 +99,8 @@ struct TeamUpperBoundFunctor { hv_size_type idx = KokkosKernels::upper_bound_team(handle, haystack_, needle_); if (idx != expected_) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", - __FILE__, __LINE__, int(handle.team_rank()), - int(expected_), int(idx)); -#else Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); -#endif ++lerrCount; } } diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp index 348bf0aa22..ae573801ac 100644 --- a/ode/impl/KokkosODE_Newton_impl.hpp +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -99,12 +99,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( } if (linSolverStat == 1) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Linear solve gesv returned failure! \n"); -#else Kokkos::printf("NewtonFunctor: Linear solve gesv returned failure! \n"); -#endif return newton_solver_status::LIN_SOLVE_FAIL; } diff --git a/sparse/src/KokkosSparse_spmv_team.hpp b/sparse/src/KokkosSparse_spmv_team.hpp index 5c9e843669..6c68478501 100644 --- a/sparse/src/KokkosSparse_spmv_team.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -55,32 +55,18 @@ int KOKKOS_INLINE_FUNCTION team_spmv( // Check compatibility of dimensions at run time. if (values.extent(0) != colIndices.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " - "values: %d, colIndices: %d", - (int)values.extent(0), (int)colIndices.extent(0)); -#else Kokkos::printf( "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " "values: %d, colIndices: %d", (int)values.extent(0), (int)colIndices.extent(0)); -#endif return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " - "x: %d, y: %d, row_ptr: %d", - (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); -#else Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); -#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL @@ -123,32 +109,18 @@ int KOKKOS_INLINE_FUNCTION team_vector_spmv( // Check compatibility of dimensions at run time. if (values.extent(0) != colIndices.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " - "values: %d, colIndices: %d", - (int)values.extent(0), (int)colIndices.extent(0)); -#else Kokkos::printf( "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " "values: %d, colIndices: %d", (int)values.extent(0), (int)colIndices.extent(0)); -#endif return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " - "x: %d, y: %d, row_ptr: %d", - (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); -#else Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); -#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index c5107fcf0a..9afd941c93 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -88,15 +88,9 @@ struct fSPMV { if (error > eps * max_val) { err++; -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, - AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); -#else Kokkos::printf("expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); -#endif } } @@ -106,16 +100,9 @@ struct fSPMV { if (error > eps * max_val) { err++; -#if KOKKOS_VERSION < 40199 - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", i, j, - AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, - eps * max_val); -#else Kokkos::printf("expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", i, j, AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, eps * max_val); -#endif } } }; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 50a28cb1e8..632279fa57 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -411,26 +411,6 @@ class epsilon { constexpr static double value = std::numeric_limits::epsilon(); }; -#if KOKKOS_VERSION < 40199 -// explicit epsilon specializations -#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT -template <> -class epsilon { - public: - constexpr static double value = 0.0009765625F; -}; -#endif // KOKKOS_HALF_T_IS_FLOAT - -// explicit epsilon specializations -#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT -template <> -class epsilon { - public: - constexpr static double value = 0.0078125F; -}; -#endif // KOKKOS_HALF_T_IS_FLOAT -#endif // KOKKOS_VERSION < 40199 - using KokkosKernels::Impl::getRandomBounds; template Date: Wed, 27 Mar 2024 09:39:50 -0600 Subject: [PATCH 206/326] Code for running performance measurements on ger() (#2082) * Correct flop count * Addressing feedbacks from Luc * Using 'zero()' instead of '0.' --- perf_test/blas/blas2/CMakeLists.txt | 9 +- .../KokkosBlas2_ger_perf_test_benchmark.cpp | 347 ++++++++++++++++++ 2 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp diff --git a/perf_test/blas/blas2/CMakeLists.txt b/perf_test/blas/blas2/CMakeLists.txt index 9c2aa424d1..dc106224f0 100644 --- a/perf_test/blas/blas2/CMakeLists.txt +++ b/perf_test/blas/blas2/CMakeLists.txt @@ -8,7 +8,14 @@ KOKKOSKERNELS_ADD_EXECUTABLE( IF(KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( - Blas2_Benchmark + Blas2_gemv_Benchmark SOURCES KokkosBlas2_gemv_perf_test_benchmark.cpp ) ENDIF() + +IF(KokkosKernels_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + Blas2_ger_Benchmark + SOURCES KokkosBlas2_ger_perf_test_benchmark.cpp + ) +ENDIF() diff --git a/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp new file mode 100644 index 0000000000..25c08a87b8 --- /dev/null +++ b/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp @@ -0,0 +1,347 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include "KokkosKernels_helpers.hpp" +#include "KokkosBlas2_ger.hpp" +#include + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include +#include + +struct blas2_ger_params : public perf_test::CommonInputParams { + int verbosity = 0; + int m = 5000; + int n = 5000; + bool layoutLeft = true; + std::string scalarType = "double"; + std::string yMode = "transpose"; + + static blas2_ger_params get_params(int& argc, char** argv) { + blas2_ger_params params; + perf_test::parse_common_options(argc, argv, params); + + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--verbosity", + params.verbosity)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (std::string layout; + perf_test::check_arg_str(i, argc, argv, "--layout", layout)) { + if (0 == Test::string_compare_no_case(layout, "left")) + params.layoutLeft = true; + else if (0 == Test::string_compare_no_case(layout, "right")) + params.layoutLeft = false; + else { + std::cerr << "Invalid '--layout': must be 'left' or 'right'.\n"; + exit(1); + } + ++i; + } else if (std::string scalarType; perf_test::check_arg_str( + i, argc, argv, "--scalarType", scalarType)) { + if ((0 == Test::string_compare_no_case(scalarType, "int32")) || + (0 == Test::string_compare_no_case(scalarType, "int64")) || + (0 == Test::string_compare_no_case(scalarType, "float")) || + (0 == Test::string_compare_no_case(scalarType, "double")) || + (0 == Test::string_compare_no_case(scalarType, "complex_float")) || + (0 == Test::string_compare_no_case(scalarType, "complex_double"))) { + params.scalarType = scalarType; + } else { + std::cerr << "Invalid '--scalarType': must be 'int32' or 'int64' or " + "'float' or 'double' or 'complex_float' or " + "'complex_double'.\n"; + exit(1); + } + ++i; + } else if (std::string yMode; + perf_test::check_arg_str(i, argc, argv, "--yMode", yMode)) { + if ((0 == Test::string_compare_no_case(yMode, "transpose")) || + (0 == Test::string_compare_no_case(yMode, "Hermitian"))) { + params.yMode = yMode; + } else { + std::cerr + << "Invalid '--yMode': must be 'transpose' or 'Hermitian'.\n"; + exit(1); + } + ++i; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + exit(1); + } + } + return params; + } + + static void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --m :: number of rows to generate (default 5000)" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate (default 5000)" + << std::endl; + std::cerr << "\t[Optional] --layout :: matrix layout ('left' or 'right', " + "default 'left')" + << std::endl; + std::cerr + << "\t[Optional] --scalarType :: scalar type ('int32' or 'int64'" + " or 'float' or 'double' or 'complex_float' or 'complex_double'" + ", default 'double')" + << std::endl; + std::cerr << "\t[Optional] --yMode :: y mode ('transpose' or 'Hermitian'" + ", default 'transpose')" + << std::endl; + } +}; + +template +static void KokkosBlas2_GER(benchmark::State& state) { + const auto verbosity = state.range(0); + const auto m = state.range(1); + const auto n = state.range(2); + const auto yIsTranspose = state.range(3); + tScalar a(Kokkos::ArithTraits::zero()); + + if (verbosity > 0) { + std::cout << "Entering KokkosBlas2_GER()" + << ": m = " << m << ", n = " << n + << ", yIsTranspose = " << yIsTranspose + << ", tScalar = " << Kokkos::ArithTraits::name() + << ", tLayout = " << typeid(tLayout).name() << std::endl; + } + + using MemSpace = typename tExecSpace::memory_space; + using Device = Kokkos::Device; + + Kokkos::View A( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), m, n); + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), m); + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), n); + + Kokkos::Random_XorShift64_Pool pool(123); + + char yMode('t'); + if (!yIsTranspose) yMode = 'H'; + + tScalar rangeValue(Kokkos::ArithTraits::zero()); + if constexpr (Kokkos::ArithTraits::isOrdinal) { + rangeValue = 10; + a = 3; + } else if constexpr (Kokkos::ArithTraits::is_complex) { + rangeValue.real() = 10.; + rangeValue.imag() = 10.; + a = tScalar(2.5, 3.6); + } else { + rangeValue = 10.; + a = 2.5; + } + + // Fill 'A', 'x', and 'y' with samples from an uniform random variable with + // range [1,rangeValue) + Kokkos::fill_random(A, pool, rangeValue); + Kokkos::fill_random(x, pool, rangeValue); + Kokkos::fill_random(y, pool, rangeValue); + + if (verbosity > 0) { + std::cout << "In KokkosBlas2_GER()" + << ": yMode = " << yMode << ", a = " << a + << ", rangeValue = " << rangeValue << std::endl; + } + + // Do a warm-up run + KokkosBlas::ger(&yMode, a, x, y, A); + Kokkos::fence(); + double total_time = 0.0; + + for (auto _ : state) { + // Start timing + Kokkos::Timer timer; + KokkosBlas::ger(&yMode, a, x, y, A); + tExecSpace().fence(); + + double time = timer.seconds(); + total_time += time; + state.SetIterationTime(time); + } + + state.counters[tExecSpace::name()] = 1; + state.counters["Avg GER time (s):"] = + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); + size_t flopsPerRun = (size_t)3 * m * n; + state.counters["Avg GER FLOP/s:"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); + + if (verbosity > 0) { + std::cout << "Leaving KokkosBlas2_GER()" << std::endl; + } +} + +template +void run(const blas2_ger_params& params) { + const auto name = "KokkosBlas2_GER"; + const auto arg_names = std::vector{ + "verbosity", "m", "n", "yMode", + params.layoutLeft ? "LayoutLeft" : "LayoutRight"}; + const auto args = std::vector{params.verbosity, params.m, params.n, + (params.yMode == "transpose"), 1}; + + if (params.layoutLeft) { + if (params.scalarType == "int32") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "int64") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "float") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "double") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "complex_float") { + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_GER, Kokkos::LayoutLeft, + tExecSpace>, + arg_names, args, params.repeat); + } else { + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_GER, Kokkos::LayoutLeft, + tExecSpace>, + arg_names, args, params.repeat); + } + } else { + if (params.scalarType == "int32") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "int64") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "float") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "double") { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GER, + arg_names, args, params.repeat); + } else if (params.scalarType == "complex_float") { + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_GER, Kokkos::LayoutRight, + tExecSpace>, + arg_names, args, params.repeat); + } else { + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_GER, Kokkos::LayoutRight, + tExecSpace>, + arg_names, args, params.repeat); + } + } +} + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + const auto params = blas2_ger_params::get_params(argc, argv); + + // std::cout << "In main(): params.repeat = " << params.repeat << std::endl; + + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + run(params); +#else + std::cout << "ERROR: PThreads requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + run(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + run(params); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + run(params); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + + // use serial if no backend is specified + if (!params.use_cuda && !params.use_hip && !params.use_openmp && + !params.use_sycl && !params.use_threads) { +#if defined(KOKKOS_ENABLE_SERIAL) + run(params); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + return 0; +} From b04dd151212b93e67692aba65de2e0a48b64e48e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 27 Mar 2024 10:57:10 -0600 Subject: [PATCH 207/326] KokkosBlas1_axpby.hpp: change debug macro guard for printInformation (#2157) * KokkosBlas1_axpby.hpp: change debug macro guard for printInformation - resolves test failures in Trilinos (MueLu) that rely on gold file diff comparisons by removing extra output in debug builds * fix compilation error --- .../impl/KokkosBlas1_axpby_unification_attempt_traits.hpp | 2 +- blas/src/KokkosBlas1_axpby.hpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 1a66637b13..9d200e892d 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -768,7 +768,7 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Routine to print information on input variables and internal variables // ******************************************************************** -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static void printInformation(std::ostream& os, std::string const& headerMsg) { os << headerMsg << ": AV = " << typeid(AV).name() diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 55da4f437a..0f95786dfc 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -17,9 +17,9 @@ #ifndef KOKKOSBLAS1_AXPBY_HPP_ #define KOKKOSBLAS1_AXPBY_HPP_ -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) #include -#endif +#endif // KOKKOSKERNELS_DEBUG_LEVEL #include #include @@ -73,9 +73,9 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // Perform compile time checks and run time checks. // ********************************************************************** AxpbyTraits::performChecks(a, X, b, Y); -#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) AxpbyTraits::printInformation(std::cout, "axpby(), unif information"); -#endif +#endif // KOKKOSKERNELS_DEBUG_LEVEL // ********************************************************************** // Call Impl::Axpby<...>::axpby(...) From 327ab4886ae747c0e5c8c109dabf029da7505f3c Mon Sep 17 00:00:00 2001 From: Wyatt Horne <68676884+wjhorne@users.noreply.github.com> Date: Wed, 27 Mar 2024 12:59:32 -0600 Subject: [PATCH 208/326] Add user tolerance to Serial SVD (#2120) * Add user tolerance to Serial SVD --------- Co-authored-by: whorne Co-authored-by: Carl Pearson --- .../impl/KokkosBatched_SVD_Serial_Impl.hpp | 19 +++++++++---------- .../KokkosBatched_SVD_Serial_Internal.hpp | 18 ++++++++++-------- batched/dense/src/KokkosBatched_SVD_Decl.hpp | 17 +++++++++-------- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp index 20dab77092..a2c345f4fb 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp @@ -24,11 +24,10 @@ namespace KokkosBatched { // Version which computes the full factorization template -KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_USV_Tag, const AViewType &A, - const UViewType &U, - const SViewType &sigma, - const VViewType &Vt, - const WViewType &work) { +KOKKOS_INLINE_FUNCTION int SerialSVD::invoke( + SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &sigma, + const VViewType &Vt, const WViewType &work, + typename AViewType::const_value_type tol) { static_assert(Kokkos::is_view_v && AViewType::rank == 2, "SVD: A must be a rank-2 view"); static_assert(Kokkos::is_view_v && UViewType::rank == 2, @@ -46,14 +45,14 @@ KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_USV_Tag, const AViewType &A, return KokkosBatched::SerialSVDInternal::invoke( A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), U.data(), U.stride(0), U.stride(1), Vt.data(), Vt.stride(0), Vt.stride(1), - sigma.data(), sigma.stride(0), work.data()); + sigma.data(), sigma.stride(0), work.data(), tol); } // Version which computes only singular values template -KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_S_Tag, const AViewType &A, - const SViewType &sigma, - const WViewType &work) { +KOKKOS_INLINE_FUNCTION int SerialSVD::invoke( + SVD_S_Tag, const AViewType &A, const SViewType &sigma, + const WViewType &work, typename AViewType::const_value_type tol) { static_assert(Kokkos::is_view_v && AViewType::rank == 2, "SVD: A must be a rank-2 view"); static_assert(Kokkos::is_view_v && SViewType::rank == 1, @@ -66,7 +65,7 @@ KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_S_Tag, const AViewType &A, using value_type = typename AViewType::non_const_value_type; return KokkosBatched::SerialSVDInternal::invoke( A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), nullptr, 0, - 0, nullptr, 0, 0, sigma.data(), sigma.stride(0), work.data()); + 0, nullptr, 0, 0, sigma.data(), sigma.stride(0), work.data(), tol); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 34c92c2d24..87ed65d81e 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -233,7 +233,8 @@ struct SerialSVDInternal { int Bs0, int Bs1, value_type* U, int Us0, int Us1, value_type* Vt, int Vts0, int Vts1, - value_type* sigma, int ss) { + value_type* sigma, int ss, + const value_type& tol) { using KAT = Kokkos::ArithTraits; const value_type eps = Kokkos::ArithTraits::epsilon(); int p = 0; @@ -242,7 +243,8 @@ struct SerialSVDInternal { // Zero out tiny superdiagonal entries for (int i = 0; i < n - 1; i++) { if (fabs(SVDIND(B, i, i + 1)) < - eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1)))) { + eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1))) || + fabs(SVDIND(B, i, i + 1)) < tol) { SVDIND(B, i, i + 1) = KAT::zero(); } } @@ -337,11 +339,11 @@ struct SerialSVDInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(int m, int n, value_type* A, int As0, - int As1, value_type* U, int Us0, - int Us1, value_type* Vt, int Vts0, - int Vts1, value_type* sigma, int ss, - value_type* work) { + KOKKOS_INLINE_FUNCTION static int invoke( + int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, + int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss, + value_type* work, + value_type tol = Kokkos::ArithTraits::zero()) { // First, if m < n, need to instead compute (V, s, U^T) = A^T. // This just means swapping U & Vt, and implicitly transposing A, U and Vt. if (m < n) { @@ -366,7 +368,7 @@ struct SerialSVDInternal { return 0; } bidiagonalize(m, n, A, As0, As1, U, Us0, Us1, Vt, Vts0, Vts1, work); - bidiSVD(m, n, A, As0, As1, U, Us0, Us1, Vt, Vts0, Vts1, sigma, ss); + bidiSVD(m, n, A, As0, As1, U, Us0, Us1, Vt, Vts0, Vts1, sigma, ss, tol); postprocessSVD(m, n, U, Us0, Us1, Vt, Vts0, Vts1, sigma, ss); return 0; } diff --git a/batched/dense/src/KokkosBatched_SVD_Decl.hpp b/batched/dense/src/KokkosBatched_SVD_Decl.hpp index c5dc5805d9..e84008cb69 100644 --- a/batched/dense/src/KokkosBatched_SVD_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SVD_Decl.hpp @@ -58,17 +58,18 @@ struct SerialSVD { // Version to compute full factorization: A == U * diag(s) * Vt template - KOKKOS_INLINE_FUNCTION static int invoke(SVD_USV_Tag, const AViewType &A, - const UViewType &U, - const SViewType &s, - const VtViewType &Vt, - const WViewType &W); + KOKKOS_INLINE_FUNCTION static int invoke( + SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &s, + const VtViewType &Vt, const WViewType &W, + typename AViewType::const_value_type tol = + Kokkos::ArithTraits::zero()); // Version which computes only singular values template - KOKKOS_INLINE_FUNCTION static int invoke(SVD_S_Tag, const AViewType &A, - const SViewType &s, - const WViewType &W); + KOKKOS_INLINE_FUNCTION static int invoke( + SVD_S_Tag, const AViewType &A, const SViewType &s, const WViewType &W, + typename AViewType::const_value_type tol = + Kokkos::ArithTraits::zero()); }; } // namespace KokkosBatched From 356e2279e55d1ef6970e016888a46417a51d9ec5 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 28 Mar 2024 08:37:58 -0600 Subject: [PATCH 209/326] Add a simple Harwell-Boeing file reader (#2155) * Add a simple Harwell-Boeing file reader And a test that validates against the MM reader. * Support for symmetrize * This loop can be simplified, there's no diag duplication * Improve IO test --- sparse/src/KokkosSparse_IOUtils.hpp | 244 ++++++++++++++++++++- sparse/src/KokkosSparse_Utils.hpp | 257 ++++------------------- sparse/unit_test/Test_Sparse.hpp | 1 + sparse/unit_test/Test_Sparse_IOUtils.hpp | 198 +++++++++++++++++ 4 files changed, 480 insertions(+), 220 deletions(-) create mode 100644 sparse/unit_test/Test_Sparse_IOUtils.hpp diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index 4704a8724c..10e698e170 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -19,6 +19,8 @@ #include "KokkosKernels_IOUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" +#include + namespace KokkosSparse { namespace Impl { @@ -1063,6 +1065,232 @@ int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, return 0; } +/** + * Read a matrix from a file using the Harwell-Boeing Exchange Format + */ +template +int read_hb(const char *fileName, lno_t &nrows, lno_t &ncols, size_type &ne, + size_type **xadj, lno_t **adj, scalar_t **ew) { + using namespace MM; + + std::ifstream mmf(fileName, std::ifstream::in); + if (!mmf.is_open()) { + throw std::runtime_error("File cannot be opened\n"); + } + + // Get the title line, don't need to do anything with that data + std::string fline = ""; + getline(mmf, fline); + + // Get metadata, rhs_lines is optional + getline(mmf, fline); + std::istringstream ss(fline); + size_type total_lines = 0, ptr_lines = 0, col_lines = 0, val_lines = 0, + rhs_lines = 0; + + ss >> total_lines >> ptr_lines >> col_lines >> val_lines >> rhs_lines; + + if (total_lines == 0 || ptr_lines == 0 || col_lines == 0) { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + ", Line 2 did not have valid values"); + } + + if (rhs_lines > 0) { + throw std::runtime_error( + std::string("Problem reading HB file ") + fileName + + ", reader does not support RHS info at this time."); + } + + // Get next line of metadata, neltvl is optional + getline(mmf, fline); + ss = std::istringstream(fline); + std::string matrix_info; + size_type nrow = 0, ncol = 0, nnz_raw = 0, neltvl = 0; + + ss >> matrix_info >> nrow >> ncol >> nnz_raw >> neltvl; + + if (matrix_info.size() != 3 || nrow == 0 || ncol == 0 || nnz_raw == 0) { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + + ", Line 3 did not have valid values: " + fline); + } + + const char matrix_scalar = matrix_info[0]; + const char matrix_type_raw = matrix_info[1]; + const char matrix_assembly = matrix_info[2]; + + // check matrix_scalar matches scalar_t + if (matrix_scalar == 'R') { + if (!(std::is_same::value || + std::is_same::value || + std::is_floating_point::value)) { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + + ", scalar_t in read_hb() incompatible with " + "float or double typed HB file."); + } + } else if (matrix_scalar == 'C') { + if (!(std::is_same>::value || + std::is_same>::value)) { + throw std::runtime_error( + std::string("Problem reading HB file ") + fileName + + ", scalar_t in read_hb() incompatible with complex-typed HB file."); + } + } + if (matrix_assembly != 'A') { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + + ", only assembled matrices are supported."); + } + + // Get next line of metadata + getline(mmf, fline); + ss = std::istringstream(fline); + std::string ptrfmt, indfmt, valfmt, rhsfmt; + ss >> ptrfmt >> indfmt >> valfmt >> rhsfmt; + + if (ptrfmt == "" || indfmt == "" || valfmt == "") { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + ", Line 4 did not have valid values"); + } + + // Examine mtx properties + const bool pattern_only = matrix_scalar == 'P'; + MtxSym matrix_type = MtxSym::GENERAL; + if (matrix_type_raw == 'S') matrix_type = MtxSym::SYMMETRIC; + if (matrix_type_raw == 'H') matrix_type = MtxSym::HERMITIAN; + if (matrix_type_raw == 'Z') matrix_type = MtxSym::SKEW_SYMMETRIC; + const bool symmetrize = matrix_type_raw == 'S' || matrix_type_raw == 'H' || + matrix_type_raw == 'Z'; + + // Allocate temp storage + std::vector raw_rows(nrow + 1); + std::vector raw_cols(nnz_raw); + std::vector raw_vals(nnz_raw); + + // Read row_idx + size_type idx = 0; + for (size_type i = 0; i < ptr_lines; ++i) { + getline(mmf, fline); + ss = std::istringstream(fline); + size_type val; + while (ss >> val) { + raw_rows[idx++] = (val - 1); // HB uses 1-based indexing + } + } + if (idx != nrow + 1) { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + + ", did not find expected number of col ptrs"); + } + + // Read cols + idx = 0; + for (size_type i = 0; i < col_lines; ++i) { + getline(mmf, fline); + ss = std::istringstream(fline); + lno_t val; + while (ss >> val) { + raw_cols[idx++] = (val - 1); // HB uses 1-based indexing + } + } + if (idx != nnz_raw) { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + + ", did not find expected number of cols"); + } + + // Read vals if not pattern only + if (!pattern_only) { + idx = 0; + for (size_type i = 0; i < val_lines; ++i) { + getline(mmf, fline); + // The 'e' before the exponent is needed for the stringstream to read + // the value correctly + fline = std::regex_replace(fline, std::regex("([0-9])([+-])"), "$1e$2"); + ss = std::istringstream(fline); + while (ss) { + auto val = readScalar(ss); + // ss will be false if we read past the end + if (ss) { + raw_vals[idx++] = val; + } + } + } + if (idx != nnz_raw) { + throw std::runtime_error(std::string("Problem reading HB file ") + + fileName + + ", did not find expected number of values"); + } + } else { + // Initialize to one + for (size_type i = 0; i < nnz_raw; ++i) { + raw_vals[i] = Kokkos::ArithTraits::one(); + } + } + + // Process raw data + size_type nnz = 0; // real nnz, differs from nnz_raw if symmetrize + if (symmetrize) { + const size_type numEdges = 2 * nnz_raw; + // numEdges is only an upper bound (diagonal entries may be removed) + std::vector> edges( + numEdges); + for (size_type row_idx = 0; row_idx < nrow; ++row_idx) { + const size_type row_nnz_begin = raw_rows[row_idx]; + const size_type row_nnz_end = raw_rows[row_idx + 1]; + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; + ++row_nnz) { + const lno_t col_idx = raw_cols[row_nnz]; + const scalar_t val = raw_vals[row_nnz]; + struct KokkosKernels::Impl::Edge tmp = {(lno_t)row_idx, + col_idx, val}, + tmp2 = { + col_idx, (lno_t)row_idx, symmetryFlip(val, matrix_type) + }; // symmetric edge + edges[nnz++] = tmp; + if (row_idx != (size_type)col_idx) { // non-diagonal + edges[nnz++] = tmp2; + } + } + } + std::sort(edges.begin(), edges.begin() + nnz); + + KokkosKernels::Impl::md_malloc(xadj, nrow + 1); + KokkosKernels::Impl::md_malloc(adj, nnz); + KokkosKernels::Impl::md_malloc(ew, nnz); + + size_type curr_nnz = 0; + for (size_type i = 0; i < nrow; ++i) { + (*xadj)[i] = curr_nnz; + while (curr_nnz < nnz && + static_cast(edges[curr_nnz].src) == i) { + (*adj)[curr_nnz] = edges[curr_nnz].dst; + (*ew)[curr_nnz] = edges[curr_nnz].ew; + ++curr_nnz; + } + } + (*xadj)[nrow] = nnz; + } else { + KokkosKernels::Impl::md_malloc(xadj, nrow + 1); + KokkosKernels::Impl::md_malloc(adj, nnz_raw); + KokkosKernels::Impl::md_malloc(ew, nnz_raw); + + std::memcpy(*xadj, raw_rows.data(), raw_rows.size() * sizeof(size_type)); + std::memcpy(*adj, raw_cols.data(), raw_cols.size() * sizeof(lno_t)); + std::memcpy(*ew, raw_vals.data(), raw_vals.size() * sizeof(scalar_t)); + + nnz = nnz_raw; + } + + // Set outputs + nrows = nrow; + ncols = ncol; + ne = nnz; + + return 0; +} + // Version of read_mtx which does not capture the number of columns. // This is the old interface; it's kept for backwards compatibility. template @@ -1084,6 +1312,12 @@ void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false); } + else if (KokkosKernels::Impl::endswith(strfilename, ".rsa") || + KokkosKernels::Impl::endswith(strfilename, ".hb")) { + lno_t ncol; // will discard + read_hb(filename, *nv, ncol, *ne, xadj, adj, ew); + } + else if (KokkosKernels::Impl::endswith(strfilename, ".bin")) { read_graph_bin(nv, ne, xadj, adj, ew, filename); } @@ -1102,7 +1336,8 @@ crsMat_t read_kokkos_crst_matrix(const char *filename_) { std::string strfilename(filename_); bool isMatrixMarket = KokkosKernels::Impl::endswith(strfilename, ".mtx") || KokkosKernels::Impl::endswith(strfilename, ".mm"); - + bool isHB = KokkosKernels::Impl::endswith(strfilename, ".rsa") || + KokkosKernels::Impl::endswith(strfilename, ".hb"); typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type::non_const_type row_map_view_t; typedef typename graph_t::entries_type::non_const_type cols_view_t; @@ -1117,9 +1352,12 @@ crsMat_t read_kokkos_crst_matrix(const char *filename_) { scalar_t *values; if (isMatrixMarket) { - // MatrixMarket file contains the exact number of columns + // MatrixMarket and HBE files contain the exact number of columns read_mtx(filename_, &nr, &nc, &nnzA, &xadj, &adj, &values, false, false, false); + } else if (isHB) { + read_hb(filename_, nr, nc, nnzA, &xadj, &adj, + &values); } else { //.crs and .bin files don't contain #cols, so will compute it later based on // the entries @@ -1146,7 +1384,7 @@ crsMat_t read_kokkos_crst_matrix(const char *filename_) { Kokkos::deep_copy(values_view, hv); } - if (!isMatrixMarket) { + if (!(isMatrixMarket || isHB)) { KokkosKernels::Impl::kk_view_reduce_max( nnzA, columns_view, nc); diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 2b89c1a2f7..876ba0fc58 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -1001,33 +1001,6 @@ void graph_min_max_degree(const rowmap_t &rowmap, ordinal_t &min_degree, max_degree = result.max_val; } -template -void kk_get_lower_triangle_count_sequential(const lno_t nv, - const size_type *in_xadj, - const lno_t *in_adj, - size_type *out_xadj, - const lno_t *new_indices = NULL) { - for (lno_t i = 0; i < nv; ++i) { - lno_t row_index = i; - - if (new_indices) row_index = new_indices[i]; - - out_xadj[i] = 0; - size_type begin = in_xadj[i]; - lno_t rowsize = in_xadj[i + 1] - begin; - - for (lno_t j = 0; j < rowsize; ++j) { - lno_t col = in_adj[j + begin]; - lno_t col_index = col; - if (new_indices) col_index = new_indices[col]; - - if (row_index > col_index) { - ++out_xadj[i]; - } - } - } -} - template struct LowerTriangularMatrix { @@ -1063,12 +1036,14 @@ struct LowerTriangularMatrix { const lno_t team_work_size; const KokkosKernels::Impl::ExecSpaceType exec_space; const bool is_lower; + const bool incl_diag; LowerTriangularMatrix(const lno_t num_rows_, const size_type *xadj_, const lno_t *adj_, const scalar_t *in_vals_, const lno_t *permutation_, size_type *t_xadj_, lno_t *t_adj_, scalar_t *out_vals_, - const lno_t team_row_work_size_, bool is_lower_ = true) + const lno_t team_row_work_size_, bool is_lower_ = true, + bool incl_diag_ = false) : num_rows(num_rows_), xadj(xadj_), adj(adj_), @@ -1080,7 +1055,8 @@ struct LowerTriangularMatrix { team_work_size(team_row_work_size_), exec_space( KokkosKernels::Impl::kk_get_exec_space_type()), - is_lower(is_lower_) {} + is_lower(is_lower_), + incl_diag(incl_diag_) {} KOKKOS_INLINE_FUNCTION void operator()(const CountTag &, @@ -1109,14 +1085,10 @@ struct LowerTriangularMatrix { if (permutation != NULL) { colIndex = permutation[colIndex]; } - if (is_lower) { - if (row_perm > colIndex) { - rowsize_ += 1; - } - } else { - if (row_perm < colIndex) { - rowsize_ += 1; - } + if ((is_lower && row_perm > colIndex) || + (!is_lower && row_perm < colIndex) || + (incl_diag && row_perm == colIndex)) { + rowsize_ += 1; } }, lower_row_size); @@ -1170,20 +1142,13 @@ struct LowerTriangularMatrix { if (permutation != NULL) { colperm = permutation[colIndex]; } - if (is_lower) { - if (row_perm > colperm) { - if (in_vals != NULL) { - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; - } - } else { - if (row_perm < colperm) { - if (in_vals != NULL) { - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + if ((is_lower && row_perm > colperm) || + (!is_lower && row_perm < colperm) || + (incl_diag && row_perm == colperm)) { + if (in_vals != NULL) { + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } } }); @@ -1194,7 +1159,7 @@ void kk_get_lower_triangle_count_parallel( const lno_t nv, const size_type ne, const size_type *in_xadj, const lno_t *in_adj, size_type *out_xadj, const lno_t *new_indices = NULL, bool use_dynamic_scheduling = false, int chunksize = 4, - bool is_lower = true) { + bool is_lower = true, bool incl_diag = false) { const int vector_size = kk_get_suggested_vector_size( nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( @@ -1204,7 +1169,7 @@ void kk_get_lower_triangle_count_parallel( typedef LowerTriangularMatrix ltm_t; ltm_t ltm(nv, in_xadj, in_adj, NULL, new_indices, out_xadj, NULL, NULL, - team_work_chunk_size, is_lower); + team_work_chunk_size, is_lower, incl_diag); typedef typename ltm_t::team_count_policy_t count_tp_t; typedef typename ltm_t::dynamic_team_count_policy_t d_count_tp_t; @@ -1360,7 +1325,7 @@ void kk_get_lower_triangle_fill_parallel( const lno_t *in_adj, const scalar_t *in_vals, size_type *out_xadj, lno_t *out_adj, scalar_t *out_vals, const lno_t *new_indices = NULL, bool use_dynamic_scheduling = false, bool chunksize = 4, - bool is_lower = true) { + bool is_lower = true, bool incl_diag = false) { const int vector_size = kk_get_suggested_vector_size( nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( @@ -1371,7 +1336,7 @@ void kk_get_lower_triangle_fill_parallel( typedef LowerTriangularMatrix ltm_t; ltm_t ltm(nv, in_xadj, in_adj, in_vals, new_indices, out_xadj, out_adj, - out_vals, team_work_chunk_size, is_lower); + out_vals, team_work_chunk_size, is_lower, incl_diag); typedef typename ltm_t::team_fill_policy_t fill_p_t; typedef typename ltm_t::dynamic_team_fill_policy_t d_fill_p_t; @@ -1391,48 +1356,20 @@ void kk_get_lower_triangle_fill_parallel( } ExecutionSpace().fence(); } -template -void kk_get_lower_triangle_fill_sequential(lno_t nv, const size_type *in_xadj, - const lno_t *in_adj, - const scalar_t *in_vals, - const size_type *out_xadj, - lno_t *out_adj, scalar_t *out_vals, - const lno_t *new_indices = NULL) { - for (lno_t i = 0; i < nv; ++i) { - lno_t row_index = i; - - if (new_indices) row_index = new_indices[i]; - size_type write_index = out_xadj[i]; - size_type begin = in_xadj[i]; - lno_t rowsize = in_xadj[i + 1] - begin; - for (lno_t j = 0; j < rowsize; ++j) { - lno_t col = in_adj[j + begin]; - lno_t col_index = col; - if (new_indices) col_index = new_indices[col]; - - if (row_index > col_index) { - if (in_vals != NULL && out_vals != NULL) { - out_vals[write_index] = in_vals[j + begin]; - } - out_adj[write_index++] = col; - } - } - } -} + template void kk_get_lower_triangle_count(const lno_t nv, const size_type ne, const size_type *in_xadj, const lno_t *in_adj, size_type *out_xadj, const lno_t *new_indices = NULL, bool use_dynamic_scheduling = false, - bool chunksize = 4, bool is_lower = true) { + bool chunksize = 4, bool is_lower = true, + bool incl_diag = false) { // Kokkos::Timer timer1; - // kk_get_lower_triangle_count_sequential(nv, in_xadj, in_adj, out_xadj, - // new_indices); kk_get_lower_triangle_count_parallel( nv, ne, in_xadj, in_adj, out_xadj, new_indices, use_dynamic_scheduling, - chunksize, is_lower); + chunksize, is_lower, incl_diag); // double count = timer1.seconds(); // std::cout << "lower count time:" << count<< std::endl; } @@ -1444,23 +1381,14 @@ void kk_get_lower_triangle_fill(lno_t nv, size_type ne, lno_t *out_adj, scalar_t *out_vals, const lno_t *new_indices = NULL, bool use_dynamic_scheduling = false, - bool chunksize = 4, bool is_lower = true) { + bool chunksize = 4, bool is_lower = true, + bool incl_diag = false) { // Kokkos::Timer timer1; - /* - kk_get_lower_triangle_fill_sequential( - nv, in_xadj, in_adj, - in_vals, - out_xadj, - out_adj, - out_vals, - new_indices - ); - */ kk_get_lower_triangle_fill_parallel( nv, ne, in_xadj, in_adj, in_vals, out_xadj, out_adj, out_vals, - new_indices, use_dynamic_scheduling, chunksize, is_lower); + new_indices, use_dynamic_scheduling, chunksize, is_lower, incl_diag); // double fill = timer1.seconds(); // std::cout << "lower fill time:" << fill<< std::endl; @@ -1470,64 +1398,8 @@ template crstmat_t kk_get_lower_triangle( crstmat_t in_crs_matrix, typename crstmat_t::index_type::value_type *new_indices = NULL, - bool use_dynamic_scheduling = false, bool chunksize = 4) { - typedef typename crstmat_t::execution_space exec_space; - typedef typename crstmat_t::StaticCrsGraphType graph_t; - typedef typename crstmat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crstmat_t::index_type::non_const_type cols_view_t; - typedef typename crstmat_t::values_type::non_const_type values_view_t; - // typedef typename crstmat_t::row_map_type::const_type const_row_map_view_t; - // typedef typename crstmat_t::index_type::const_type const_cols_view_t; - // typedef typename crstmat_t::values_type::const_type const_values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - - lno_t nr = in_crs_matrix.numRows(); - - const scalar_t *vals = in_crs_matrix.values.data(); - const size_type *rowmap = in_crs_matrix.graph.row_map.data(); - const lno_t *entries = in_crs_matrix.graph.entries.data(); - const size_type ne = in_crs_matrix.graph.entries.extent(0); - - row_map_view_t new_row_map( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "LL"), nr + 1); - kk_get_lower_triangle_count( - nr, ne, rowmap, entries, new_row_map.data(), new_indices, - use_dynamic_scheduling, chunksize); - - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nr + 1, new_row_map); - exec_space().fence(); - - auto ll_size = Kokkos::subview(new_row_map, nr); - auto h_ll_size = Kokkos::create_mirror_view(ll_size); - Kokkos::deep_copy(h_ll_size, ll_size); - size_type ll_nnz_size = h_ll_size(); - - // cols_view_t new_entries ("LL", ll_nnz_size); - // values_view_t new_values ("LL", ll_nnz_size); - cols_view_t new_entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "LL"), - ll_nnz_size); - values_view_t new_values( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "LL"), ll_nnz_size); - - kk_get_lower_triangle_fill( - nr, ne, rowmap, entries, vals, new_row_map.data(), new_entries.data(), - new_values.data(), new_indices, use_dynamic_scheduling, chunksize); - - graph_t g(new_entries, new_row_map); - crstmat_t new_ll_mtx("lower triangle", in_crs_matrix.numCols(), new_values, - g); - return new_ll_mtx; -} - -template -crstmat_t kk_get_lower_crs_matrix( - crstmat_t in_crs_matrix, - typename crstmat_t::index_type::value_type *new_indices = NULL, - bool use_dynamic_scheduling = false, bool chunksize = 4) { + bool use_dynamic_scheduling = false, bool chunksize = 4, + bool is_lower = true, bool incl_diag = false) { typedef typename crstmat_t::execution_space exec_space; typedef typename crstmat_t::StaticCrsGraphType graph_t; typedef typename crstmat_t::row_map_type::non_const_type row_map_view_t; @@ -1552,7 +1424,7 @@ crstmat_t kk_get_lower_crs_matrix( Kokkos::view_alloc(Kokkos::WithoutInitializing, "LL"), nr + 1); kk_get_lower_triangle_count( nr, ne, rowmap, entries, new_row_map.data(), new_indices, - use_dynamic_scheduling, chunksize); + use_dynamic_scheduling, chunksize, is_lower, incl_diag); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nr + 1, new_row_map); @@ -1572,7 +1444,8 @@ crstmat_t kk_get_lower_crs_matrix( kk_get_lower_triangle_fill( nr, ne, rowmap, entries, vals, new_row_map.data(), new_entries.data(), - new_values.data(), new_indices, use_dynamic_scheduling, chunksize); + new_values.data(), new_indices, use_dynamic_scheduling, chunksize, + is_lower, incl_diag); graph_t g(new_entries, new_row_map); crstmat_t new_ll_mtx("lower triangle", in_crs_matrix.numCols(), new_values, @@ -1580,67 +1453,17 @@ crstmat_t kk_get_lower_crs_matrix( return new_ll_mtx; } -template -graph_t kk_get_lower_crs_graph(graph_t in_crs_matrix, - typename graph_t::data_type *new_indices = NULL, - bool /*use_dynamic_scheduling*/ = false, - bool /*chunksize*/ = 4) { - typedef typename graph_t::execution_space exec_space; - - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - - // typedef typename graph_t::row_map_type::const_type const_row_map_view_t; - // typedef typename graph_t::entries_type::const_type const_cols_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - - lno_t nr = in_crs_matrix.numRows(); - const size_type *rowmap = in_crs_matrix.row_map.data(); - const lno_t *entries = in_crs_matrix.entries.data(); - - const size_type ne = in_crs_matrix.graph.entries.extent(0); - - row_map_view_t new_row_map( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "LL"), nr + 1); - kk_get_lower_triangle_count( - nr, ne, rowmap, entries, new_row_map.data(), new_indices); - - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nr + 1, new_row_map); - exec_space().fence(); - - auto ll_size = Kokkos::subview(new_row_map, nr); - auto h_ll_size = Kokkos::create_mirror_view(ll_size); - Kokkos::deep_copy(h_ll_size, ll_size); - size_type ll_nnz_size = h_ll_size(); - - cols_view_t new_entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "LL"), - ll_nnz_size); - - kk_get_lower_triangle_fill( - nr, ne, rowmap, entries, NULL, new_row_map.data(), new_entries.data(), - NULL, new_indices); - - graph_t g(new_entries, new_row_map); - - return g; -} - template -void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, - row_map_view_t in_rowmap, cols_view_t in_entries, - values_view_t in_values, - out_row_map_view_t &out_rowmap, - out_cols_view_t &out_entries, - out_values_view_t &out_values, - new_indices_t &new_indices, - bool use_dynamic_scheduling = false, - bool chunksize = 4, bool is_lower = true) { +void kk_get_lower_triangle( + typename cols_view_t::non_const_value_type nr, row_map_view_t in_rowmap, + cols_view_t in_entries, values_view_t in_values, + out_row_map_view_t &out_rowmap, out_cols_view_t &out_entries, + out_values_view_t &out_values, new_indices_t &new_indices, + bool use_dynamic_scheduling = false, bool chunksize = 4, + bool is_lower = true, bool incl_diag = false) { // typedef typename row_map_view_t::const_type const_row_map_view_t; // typedef typename cols_view_t::const_type const_cols_view_t; // typedef typename values_view_t::const_type const_values_view_t; @@ -1658,7 +1481,7 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, Kokkos::view_alloc(Kokkos::WithoutInitializing, "LL"), nr + 1); kk_get_lower_triangle_count( nr, ne, rowmap, entries, out_rowmap.data(), new_indices.data(), - use_dynamic_scheduling, chunksize, is_lower); + use_dynamic_scheduling, chunksize, is_lower, incl_diag); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, out_rowmap); @@ -1681,7 +1504,7 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, kk_get_lower_triangle_fill( nr, ne, rowmap, entries, vals, out_rowmap.data(), out_entries.data(), out_values.data(), new_indices.data(), use_dynamic_scheduling, chunksize, - is_lower); + is_lower, incl_diag); } template + +namespace Test { + +struct TestIOUtils { + using size_type = size_t; + using lno_t = int; + using scalar_t = double; + + using exe_space = Kokkos::DefaultHostExecutionSpace; + using mem_space = typename exe_space::memory_space; + using host_device = Kokkos::Device; + + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + + using sp_matrix_type = + KokkosSparse::CrsMatrix; + + static std::vector> get_sym_fixture() { + std::vector> A = { + {11.00, 12.00, 13.00, 14.00, 15.00, 16.00}, + {12.00, 2.00, 0.00, 0.00, 0.00, 0.00}, + {13.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + {14.00, 0.00, 0.00, 4.00, 0.00, 0.00}, + {15.00, 0.00, 0.00, 0.00, 5.00, 0.00}, + {16.00, 0.00, 0.00, 0.00, 0.00, 6.00}}; + return A; + } + + static std::vector> get_asym_fixture() { + std::vector> A = { + {1.00, 0.00, 0.00, 9.00, 0.00, 0.00}, + {0.00, 2.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 0.00, 0.00, 8.00}, + {0.00, 0.00, 0.00, 4.00, 0.00, 0.00}, + {0.00, 7.00, 0.00, 0.00, 5.00, 0.00}, + {0.00, 0.00, 0.00, 0.00, 0.00, 6.00}}; + return A; + } + + static void compare_matrices(const sp_matrix_type& A1, + const sp_matrix_type& A2) { + // Compare matrices + auto row_map1 = A1.graph.row_map; + auto entries1 = A1.graph.entries; + auto values1 = A1.values; + auto row_map2 = A2.graph.row_map; + auto entries2 = A2.graph.entries; + auto values2 = A2.values; + ASSERT_EQ(row_map1.size(), row_map2.size()); + ASSERT_EQ(entries1.size(), entries2.size()); + ASSERT_EQ(values1.size(), values2.size()); + ASSERT_EQ(values1.size(), entries1.size()); + for (size_type i = 0; i < row_map1.size(); ++i) { + EXPECT_EQ(row_map1(i), row_map2(i)); + } + for (size_type i = 0; i < entries1.size(); ++i) { + EXPECT_EQ(entries1(i), entries2(i)); + EXPECT_EQ(values1(i), values2(i)); + } + } + + template + static void write_as_hb(const RowMapView& row_map, const EntriesView& entries, + const ValuesView& values, const std::string& filename, + const char mtx_type) { + std::ofstream out(filename); + size_type nrows = row_map.size() - 1; + size_type nnz = entries.size(); + + out << "1SYMMETRIC MATRIX, FE APPROXIMATION TO BIHARMONIC OPERATOR ON " + "BEAM. NOS1 \n"; // Title is inaccurate, but doesn't matter + out << " 3 1 1 1 " + " 0 \n"; + out << "R" << mtx_type << "A " << nrows + << " " << nrows << " " << nnz + << " 0 \n"; + out << "(16I5) (16I5) (5E16.8) " + " \n"; + for (size_type row_idx = 0; row_idx < nrows + 1; ++row_idx) { + out << row_map(row_idx) + 1 << " "; + } + out << "\n"; + for (size_type n = 0; n < nnz; ++n) { + out << entries[n] + 1 << " "; + } + out << "\n"; + for (size_type n = 0; n < nnz; ++n) { + out << values[n] << " "; + } + out << "\n"; + + out.close(); + } + + template + static void write_as_mtx(const RowMapView& row_map, + const EntriesView& entries, const ValuesView& values, + const std::string& filename, const char mtx_type) { + std::ofstream out(filename); + size_type nrows = row_map.size() - 1; + + std::map type_name_map = {{'U', "general"}, + {'S', "symmetric"}, + {'H', "hermitian"}, + {'Z', "skew-symmetric"}}; + std::string type_name = type_name_map[mtx_type]; + + out << "%%MatrixMarket matrix coordinate real " << type_name << "\n"; + out << nrows << " " << nrows << " " << entries.size() << "\n"; + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + const size_type row_nnz_begin = row_map(row_idx); + const size_type row_nnz_end = row_map(row_idx + 1); + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; + ++row_nnz) { + const auto col_idx = entries(row_nnz); + const scalar_t value = values(row_nnz); + out << row_idx + 1 << " " << col_idx + 1 << " " << value << "\n"; + } + } + + out.close(); + } + + static void full_test(const std::vector>& fixture, + const std::string& filename_root, const char mtx_type) { + RowMapType row_map; + EntriesType entries; + ValuesType values; + compress_matrix(row_map, entries, values, fixture); + sp_matrix_type A("A", row_map.size() - 1, row_map.size() - 1, + values.extent(0), values, row_map, entries); + const bool is_symmetric = mtx_type != 'U'; + std::string hb_file = filename_root + ".hb"; + std::string mtx_file = filename_root + ".mtx"; + + if (is_symmetric) { + sp_matrix_type L = KokkosSparse::Impl::kk_get_lower_triangle( + A, NULL, false, 4, true, true); + auto lrow_map = L.graph.row_map; + auto lentries = L.graph.entries; + auto lvalues = L.values; + + write_as_hb(lrow_map, lentries, lvalues, hb_file, mtx_type); + write_as_mtx(lrow_map, lentries, lvalues, mtx_file, mtx_type); + } else { + write_as_hb(row_map, entries, values, hb_file, mtx_type); + write_as_mtx(row_map, entries, values, mtx_file, mtx_type); + } + + auto Ahb = KokkosSparse::Impl::read_kokkos_crst_matrix( + hb_file.c_str()); + auto Amtx = KokkosSparse::Impl::read_kokkos_crst_matrix( + mtx_file.c_str()); + if (mtx_type == 'Z') { + compare_matrices(Ahb, Amtx); + } else { + compare_matrices(Ahb, A); + compare_matrices(Amtx, A); + } + } + + static void test() { + const std::string filename_root = "test_sparse_ioutils"; + auto sym_fix = get_sym_fixture(); + auto asym_fix = get_asym_fixture(); + full_test(asym_fix, filename_root + "_asym", 'U'); + full_test(sym_fix, filename_root + "_sym", 'S'); + full_test(sym_fix, filename_root + "_herm", 'H'); + full_test(sym_fix, filename_root + "_skew", 'Z'); + } +}; + +// Test randomly generated Cs matrices +TEST_F(TestCategory, sparse_ioutils) { TestIOUtils::test(); } + +} // namespace Test From 23413f59e867e68798e25e24565557ca23c92ea8 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 28 Mar 2024 14:06:08 -0600 Subject: [PATCH 210/326] spmv tpls: use correct bool for eti template param (#2160) (It should just be the default, KokkosSpars::Impl::eti_spec_avail<..>::value) --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 138 +++++-------- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 65 +++--- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 185 +++++++----------- 3 files changed, 138 insertions(+), 250 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 9c844ff910..72062c26fb 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -187,7 +187,7 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, } } -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE) \ template <> \ struct SPMV_BSRMATRIX< \ EXECSPACE, \ @@ -204,7 +204,7 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ + true> { \ using device_type = Kokkos::Device; \ using Handle = \ KokkosSparse::Impl::SPMVHandleImpl, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP) #endif #undef KOKKOSSPARSE_SPMV_MKL -#define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE) \ template <> \ struct SPMV_MV_BSRMATRIX< \ EXECSPACE, \ @@ -275,7 +269,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, true, COMPILE_LIBRARY> { \ + true, true> { \ using device_type = Kokkos::Device; \ using Handle = \ KokkosSparse::Impl::SPMVHandleImpl, Kokkos::OpenMP, }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::Serial) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP) #endif #undef KOKKOSSPARSE_SPMV_MV_MKL @@ -585,8 +571,7 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, } } -#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE) \ template <> \ struct SPMV_BSRMATRIX< \ Kokkos::Cuda, \ @@ -600,7 +585,7 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ + true> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using Handle = \ @@ -632,53 +617,37 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, }; KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #undef KOKKOSSPARSE_SPMV_CUSPARSE @@ -1021,8 +990,7 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, #endif } // spmv_bsr_rocsparse -#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE) \ template <> \ struct SPMV_BSRMATRIX< \ Kokkos::HIP, \ @@ -1036,7 +1004,7 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ + true> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using Handle = \ @@ -1068,31 +1036,25 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, }; KOKKOSSPARSE_SPMV_ROCSPARSE(float, rocsparse_int, rocsparse_int, - Kokkos::LayoutLeft, Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + Kokkos::LayoutLeft, Kokkos::HIPSpace); KOKKOSSPARSE_SPMV_ROCSPARSE(float, rocsparse_int, rocsparse_int, - Kokkos::LayoutRight, Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + Kokkos::LayoutRight, Kokkos::HIPSpace); KOKKOSSPARSE_SPMV_ROCSPARSE(double, rocsparse_int, rocsparse_int, - Kokkos::LayoutLeft, Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + Kokkos::LayoutLeft, Kokkos::HIPSpace); KOKKOSSPARSE_SPMV_ROCSPARSE(double, rocsparse_int, rocsparse_int, - Kokkos::LayoutRight, Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + Kokkos::LayoutRight, Kokkos::HIPSpace); KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, - rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + rocsparse_int, Kokkos::LayoutLeft, + Kokkos::HIPSpace); KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, rocsparse_int, Kokkos::LayoutRight, - Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + Kokkos::HIPSpace); KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, - rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + rocsparse_int, Kokkos::LayoutLeft, + Kokkos::HIPSpace); KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, rocsparse_int, Kokkos::LayoutRight, - Kokkos::HIPSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + Kokkos::HIPSpace); #undef KOKKOSSPARSE_SPMV_ROCSPARSE diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 47b7d47f8e..2ccfd89d73 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -220,8 +220,7 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecY)); } -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ - COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE) \ template <> \ struct SPMV_MV< \ Kokkos::Cuda, \ @@ -235,7 +234,7 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - false, true, COMPILE_LIBRARY> { \ + false, true> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using Handle = \ @@ -268,87 +267,67 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, col-major or row-major for X (see note below) 32-bit indices for matrix A */ KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, Kokkos::LayoutRight, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, Kokkos::LayoutRight, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) #endif diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 926d201a52..a11fdf68b2 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -218,8 +218,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], #endif // CUDA_VERSION } -#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE) \ template <> \ struct SPMV< \ Kokkos::Cuda, \ @@ -233,7 +232,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ + true> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using Handle = \ @@ -262,103 +261,71 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int64_t, size_t, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int64_t, size_t, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int64_t, size_t, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int64_t, size_t, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int64_t, size_t, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(double, int64_t, size_t, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int64_t, size_t, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(float, int64_t, size_t, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) #endif // 9000 <= CUDA_VERSION @@ -499,7 +466,7 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecX)); } -#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT, COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT) \ template <> \ struct SPMV< \ Kokkos::HIP, \ @@ -517,7 +484,7 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ + true> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using Handle = KokkosSparse::Impl::SPMVHandleImpl< \ @@ -544,22 +511,14 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], } \ }; -KOKKOSSPARSE_SPMV_ROCSPARSE(double, Kokkos::LayoutLeft, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_ROCSPARSE(double, Kokkos::LayoutRight, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_ROCSPARSE(float, Kokkos::LayoutLeft, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_ROCSPARSE(float, Kokkos::LayoutRight, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutLeft, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutLeft, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ROCSPARSE(double, Kokkos::LayoutLeft) +KOKKOSSPARSE_SPMV_ROCSPARSE(double, Kokkos::LayoutRight) +KOKKOSSPARSE_SPMV_ROCSPARSE(float, Kokkos::LayoutLeft) +KOKKOSSPARSE_SPMV_ROCSPARSE(float, Kokkos::LayoutRight) +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight) +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight) #undef KOKKOSSPARSE_SPMV_ROCSPARSE @@ -652,7 +611,7 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, // Note: classic MKL runs on Serial/OpenMP but can't use our execution space // instances -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE) \ template <> \ struct SPMV, \ Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ + true> { \ using device_type = Kokkos::Device; \ using Handle = \ KokkosSparse::Impl::SPMVHandleImpl, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP) #endif #undef KOKKOSSPARSE_SPMV_MKL @@ -787,7 +740,7 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle, reinterpret_cast(y.data())); } -#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ +#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ template <> \ struct SPMV< \ Kokkos::Experimental::SYCL, \ @@ -804,7 +757,7 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle, Kokkos::View, \ Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ + true> { \ using execution_space = Kokkos::Experimental::SYCL; \ using device_type = Kokkos::Device; \ using Handle = KokkosSparse::Impl::SPMVHandleImpl< \ @@ -833,33 +786,27 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle, }; KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_ONEMKL(double, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace) /* KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace) */ KOKKOSSPARSE_SPMV_ONEMKL(float, std::int64_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_ONEMKL(double, std::int64_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace) /* KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace + ) KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace + ) */ #endif } // namespace Impl From 02ea952c2fcf4ab3e01b888cc23e0c66feef4492 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 1 Apr 2024 08:44:17 -0600 Subject: [PATCH 211/326] Fix sparse_ioutils test on kokkos-dev (#2162) Adding a ss.sync call fixes it but I have no idea why it was needed here. All the other stringstream reads work fine. --- sparse/src/KokkosSparse_IOUtils.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index 10e698e170..36a213b83e 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -1089,7 +1089,7 @@ int read_hb(const char *fileName, lno_t &nrows, lno_t &ncols, size_type &ne, rhs_lines = 0; ss >> total_lines >> ptr_lines >> col_lines >> val_lines >> rhs_lines; - + ss.sync(); // This fixes tests on kokkos-dev/ipcp. if (total_lines == 0 || ptr_lines == 0 || col_lines == 0) { throw std::runtime_error(std::string("Problem reading HB file ") + fileName + ", Line 2 did not have valid values"); @@ -1108,7 +1108,6 @@ int read_hb(const char *fileName, lno_t &nrows, lno_t &ncols, size_type &ne, size_type nrow = 0, ncol = 0, nnz_raw = 0, neltvl = 0; ss >> matrix_info >> nrow >> ncol >> nnz_raw >> neltvl; - if (matrix_info.size() != 3 || nrow == 0 || ncol == 0 || nnz_raw == 0) { throw std::runtime_error(std::string("Problem reading HB file ") + fileName + From ae3ecca0f4885692038b4016953a1a75a87ff72c Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 3 Apr 2024 12:09:40 -0600 Subject: [PATCH 212/326] Fix #2156 (#2164) spmv: add special path for rank-2 x/y, but where both have 1 column and a TPL is available for rank-1 but not rank-2. Also call "subhandle->set_exec_space" correctly in the TPLs to ensure proper synchronization between setup, spmv and cleanup (in the case that different exec instances are used in different calls) --- sparse/src/KokkosSparse_spmv.hpp | 54 +++++++++++++++++++ sparse/src/KokkosSparse_spmv_handle.hpp | 5 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 5 ++ .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 1 + .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 6 +++ 5 files changed, 68 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 2391291695..f11b61f675 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -40,6 +40,31 @@ struct RANK_ONE {}; struct RANK_TWO {}; } // namespace +namespace Impl { +template +inline constexpr bool spmv_general_tpl_avail() { + constexpr bool isBSR = ::KokkosSparse::Experimental::is_bsr_matrix_v; + if constexpr (!isBSR) { + // CRS + if constexpr (XVector::rank() == 1) + return spmv_tpl_spec_avail::value; + else + return spmv_mv_tpl_spec_avail::value; + } else { + // BSR + if constexpr (XVector::rank() == 1) + return spmv_bsrmatrix_tpl_spec_avail::value; + else + return spmv_mv_bsrmatrix_tpl_spec_avail::value; + } +} +} // namespace Impl + // clang-format off /// \brief Kokkos sparse matrix-vector multiply. /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is @@ -221,6 +246,35 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename YVector::device_type, Kokkos::MemoryTraits>; + // Special case: XVector/YVector are rank-2 but x,y both have one column and + // are contiguous. If a TPL is available for rank-1 vectors but not rank-2, + // take rank-1 subviews of x,y and call the rank-1 version. + if constexpr (XVector::rank() == 2) { + using XVector_SubInternal = Kokkos::View< + typename XVector::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits>; + using YVector_SubInternal = Kokkos::View< + typename YVector::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; + if constexpr (!Impl::spmv_general_tpl_avail< + ExecutionSpace, HandleImpl, AMatrix_Internal, + XVector_Internal, YVector_Internal>() && + Impl::spmv_general_tpl_avail< + ExecutionSpace, HandleImpl, AMatrix_Internal, + XVector_SubInternal, YVector_SubInternal>()) { + if (x.extent(1) == size_t(1) && x.span_is_contiguous() && + y.span_is_contiguous()) { + XVector_SubInternal xsub(x.data(), x.extent(0)); + YVector_SubInternal ysub(y.data(), y.extent(0)); + spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub); + return; + } + } + } + XVector_Internal x_i(x); YVector_Internal y_i(y); diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index 9e7295c72c..a2eecfd1ce 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -237,9 +237,8 @@ struct SPMVHandleImpl { ~SPMVHandleImpl() { if (tpl) delete tpl; } - void set_exec_space(const ExecutionSpace& exec) { - if (tpl) tpl->set_exec_space(exec); - } + + ImplType* get_impl() { return this; } /// Get the SPMVAlgorithm used by this handle SPMVAlgorithm get_algorithm() const { return this->algo; } diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 72062c26fb..188bc5580d 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -48,6 +48,7 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); + subhandle->set_exec_space(exec); } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. @@ -127,6 +128,7 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); + subhandle->set_exec_space(exec); } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. @@ -378,6 +380,7 @@ void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); @@ -505,6 +508,7 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); @@ -855,6 +859,7 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for rocsparse BSR"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); handle->tpl = subhandle; diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 2ccfd89d73..500fbddbe7 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -192,6 +192,7 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); handle->tpl = subhandle; diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index a11fdf68b2..cd3e99ef81 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -111,6 +111,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); handle->tpl = subhandle; @@ -155,6 +156,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); @@ -390,6 +392,7 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for rocsparse CRS"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec); handle->tpl = subhandle; @@ -550,6 +553,8 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, MKLScalar* y_mkl = reinterpret_cast(y); if (handle->is_set_up) { subhandle = dynamic_cast(handle->tpl); + // note: classic mkl only runs on synchronous host exec spaces, so no need + // to call set_exec_space on the subhandle here if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL CRS"); @@ -710,6 +715,7 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for OneMKL CRS"); + subhandle->set_exec_space(exec); } else { subhandle = new OneMKL_SpMV_Data(exec); handle->tpl = subhandle; From 18a1119123262525870b10c044cbec7a4c3572ca Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 3 Apr 2024 12:39:58 -0600 Subject: [PATCH 213/326] Updates from feedback runnig Trilinos testing - Update debug level to > 1 guarding `printInformation(...)` in KokkosBlas1_axpby.hpp to reduce noisy test output - Loosen tolerance of lapack.svd test to avoid random failures that occur near prior tolerance level --- blas/src/KokkosBlas1_axpby.hpp | 2 +- lapack/unit_test/Test_Lapack_svd.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 0f95786dfc..5cd03dd7c7 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -73,7 +73,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // Perform compile time checks and run time checks. // ********************************************************************** AxpbyTraits::performChecks(a, X, b, Y); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if (KOKKOSKERNELS_DEBUG_LEVEL > 1) AxpbyTraits::printInformation(std::cout, "axpby(), unif information"); #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/lapack/unit_test/Test_Lapack_svd.hpp b/lapack/unit_test/Test_Lapack_svd.hpp index 6cf161fd3b..da9f9ba480 100644 --- a/lapack/unit_test/Test_Lapack_svd.hpp +++ b/lapack/unit_test/Test_Lapack_svd.hpp @@ -475,7 +475,7 @@ int impl_test_svd(const int m, const int n) { Kokkos::View; const mag_type max_val = 10; - const mag_type tol = 1000 * max_val * KAT_S::eps(); + const mag_type tol = 2000 * max_val * KAT_S::eps(); AMatrix A("A", m, n), U("U", m, m), Vt("Vt", n, n), Aref("A ref", m, n); vector_type S("S", Kokkos::min(m, n)); From fc280f0709776ec834f7ea38e2d2f72b87a80535 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 4 Apr 2024 11:45:37 -0600 Subject: [PATCH 214/326] Fix #2167: classic MKL doesn't use space instance (#2168) --- sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 6 ++++-- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 188bc5580d..7eb6307753 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -48,7 +48,8 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); - subhandle->set_exec_space(exec); + // note: classic mkl only runs on synchronous host exec spaces, so no need + // to call set_exec_space on the subhandle here } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. @@ -128,7 +129,8 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); - subhandle->set_exec_space(exec); + // note: classic mkl only runs on synchronous host exec spaces, so no need + // to call set_exec_space on the subhandle here } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index cd3e99ef81..1555050420 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -553,11 +553,11 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, MKLScalar* y_mkl = reinterpret_cast(y); if (handle->is_set_up) { subhandle = dynamic_cast(handle->tpl); - // note: classic mkl only runs on synchronous host exec spaces, so no need - // to call set_exec_space on the subhandle here if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL CRS"); + // note: classic mkl only runs on synchronous host exec spaces, so no need + // to call set_exec_space on the subhandle here } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. From 1ed57ec908f725ba390e0fb309ce1353f3dd3551 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 4 Apr 2024 14:18:43 -0600 Subject: [PATCH 215/326] CHANGELOG.md: 4.3.00 update --- CHANGELOG.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c35eda7d8..4e6da70740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,99 @@ # Change Log +## [4.3.00](https://github.com/kokkos/kokkos-kernels/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.2.01...4.3.00) + +### New Features + +#### BLAS updates +- Syr2 [\#1942](https://github.com/kokkos/kokkos-kernels/pull/1942) + +#### LAPACK updates +- Adding cuSOLVER [\#2038](https://github.com/kokkos/kokkos-kernels/pull/2038) + - Fix for MAGMA with CUDA [\#2044](https://github.com/kokkos/kokkos-kernels/pull/2044) +- Adding rocSOLVER [\#2034](https://github.com/kokkos/kokkos-kernels/pull/2034) + - Fix rocSOLVER issue with Trilinos dependency [\#2037](https://github.com/kokkos/kokkos-kernels/pull/2037) +- Lapack - SVD [\#2092](https://github.com/kokkos/kokkos-kernels/pull/2092) + - Adding benchmark for SVD [\#2103](https://github.com/kokkos/kokkos-kernels/pull/2103) + - Quick return to fix cuSOLVER and improve performance [\#2107](https://github.com/kokkos/kokkos-kernels/pull/2107) + - Fix Intel MKL tolerance for SVD tests [\#2110](https://github.com/kokkos/kokkos-kernels/pull/2110) + +#### Sparse updates +- Add block support to all SPILUK algorithms [\#2064](https://github.com/kokkos/kokkos-kernels/pull/2064) + - Block spiluk follow up [\#2085](https://github.com/kokkos/kokkos-kernels/pull/2085) + - Make spiluk_handle::reset backwards compatible [\#2087](https://github.com/kokkos/kokkos-kernels/pull/2087) +- Sptrsv improvements + - Add sptrsv execution space overloads [\#1982](https://github.com/kokkos/kokkos-kernels/pull/1982) + - Refactor Test_Sparse_sptrsv [\#2102](https://github.com/kokkos/kokkos-kernels/pull/2102) + - Add support for BSR matrices to some trsv routines [\#2104](https://github.com/kokkos/kokkos-kernels/pull/2104) +- GMRES: Add support for BSR matrices [\#2097](https://github.com/kokkos/kokkos-kernels/pull/2097) +- Spmv handle [\#2126](https://github.com/kokkos/kokkos-kernels/pull/2126) +- Option to apply RCM reordering to extracted CRS diagonal blocks [\#2125](https://github.com/kokkos/kokkos-kernels/pull/2125) + +#### ODE updates +- Adding adaptive BDF methods [\#1930](https://github.com/kokkos/kokkos-kernels/pull/1930) + +#### Misc updates +- Add HIPManagedSpace support [\#2079](https://github.com/kokkos/kokkos-kernels/pull/2079) + +### Enhancements: + +#### BLAS +- Axpby: improvement on unification attempt logic and on the execution of a diversity of situations [\#1895](https://github.com/kokkos/kokkos-kernels/pull/1895) + +#### Misc updates +- Use execution space operator== [\#2136](https://github.com/kokkos/kokkos-kernels/pull/2136) + +#### TPL support +- Add TPL support for KokkosBlas::dot [\#1949](https://github.com/kokkos/kokkos-kernels/pull/1949) +- Add CUDA/HIP TPL support for KokkosSparse::spadd [\#1962](https://github.com/kokkos/kokkos-kernels/pull/1962) +- Don't call optimize_gemv for one-shot MKL spmv [\#2073](https://github.com/kokkos/kokkos-kernels/pull/2073) +- Async matrix release for MKL >= 2023.2 in SpMV [\#2074](https://github.com/kokkos/kokkos-kernels/pull/2074) +- BLAS - MKL: fixing HostBlas calls to handle MKL_INT type [\#2112](https://github.com/kokkos/kokkos-kernels/pull/2112) + +### Build System: +- Support CUBLAS_{LIBRARIES,LIBRARY_DIRS,INCLUDE_DIRS,ROOT} and KokkosKernels_CUBLAS_ROOT CMake options [\#2075](https://github.com/kokkos/kokkos-kernels/pull/2075) +- Link std::filesystem for IntelLLVM in perf_test/sparse [\#2055](https://github.com/kokkos/kokkos-kernels/pull/2055) +- Fix Cuda TPL finding [\#2098](https://github.com/kokkos/kokkos-kernels/pull/2098) +- CMake: error out in certain case [\#2115](https://github.com/kokkos/kokkos-kernels/pull/2115) + +### Documentation and Testing: +- par_ilut: Update documentation for fill_in_limit [\#2001](https://github.com/kokkos/kokkos-kernels/pull/2001) +- Wiki examples for BLAS2 functions are added [\#2122](https://github.com/kokkos/kokkos-kernels/pull/2122) +- github workflows: update to v4 (use Node 20) [\#2119](https://github.com/kokkos/kokkos-kernels/pull/2119) + +### Benchmarks: +- gemm3 perf test: user CUDA, SYCL, or HIP device for kokkos:initialize [\#2058](https://github.com/kokkos/kokkos-kernels/pull/2058) +- Lapack: adding svd benchmark [\#2103](https://github.com/kokkos/kokkos-kernels/pull/2103) +- Benchmark: modifying spmv benchmark to fix interface and run range of spmv tests [\#2135](https://github.com/kokkos/kokkos-kernels/pull/2135) + +### Cleanup: +- Experimental hip cleanup [\#1999](https://github.com/kokkos/kokkos-kernels/pull/1999) +- iostream clean-up in benchmarks [\#2004](https://github.com/kokkos/kokkos-kernels/pull/2004) +- Update: implicit capture of 'this' via '[=]' is deprecated in C++20 warnings [\#2076](https://github.com/kokkos/kokkos-kernels/pull/2076) +- Deprecate KOKKOSLINALG_OPT_LEVEL [\#2072](https://github.com/kokkos/kokkos-kernels/pull/2072) +- Remove all mentions of HBWSpace [\#2101](https://github.com/kokkos/kokkos-kernels/pull/2101) +- Change name of yaml-cpp to yamlcpp (trilinos/Trilinos#12710) [\#2099](https://github.com/kokkos/kokkos-kernels/pull/2099) +- Hands off namespace Kokkos::Impl - cleanup couple violations that snuck in [\#2094](https://github.com/kokkos/kokkos-kernels/pull/2094) +- Kokkos Kernels: update version guards to drop old version of Kokkos [\#2133](https://github.com/kokkos/kokkos-kernels/pull/2133) +- Sparse MKL: changing the location of the MKL_SAFE_CALL macro [\#2134](https://github.com/kokkos/kokkos-kernels/pull/2134) + +### Bug Fixes: +- Bspgemm cusparse hang [\#2008](https://github.com/kokkos/kokkos-kernels/pull/2008) +- bhalf_t fix for isnan function [\#2007](https://github.com/kokkos/kokkos-kernels/pull/2007) +- Fence Kokkos before timed iterations [\#2066](https://github.com/kokkos/kokkos-kernels/pull/2066) +- CUDA 11.2.1 / cuSPARSE 11.4.0 changed SpMV enums [\#2011](https://github.com/kokkos/kokkos-kernels/pull/2011) +- Fix the spadd API [\#2090](https://github.com/kokkos/kokkos-kernels/pull/2090) +- Axpby reduce deep copy calls [\#2081](https://github.com/kokkos/kokkos-kernels/pull/2081) +- Correcting BLAS test failures with cuda when ETI_ONLY = OFF (issue #2061) [\#2077](https://github.com/kokkos/kokkos-kernels/pull/2077) +- Fix weird Trilinos compiler error [\#2117](https://github.com/kokkos/kokkos-kernels/pull/2117) +- Fix for missing STL inclusion [\#2113](https://github.com/kokkos/kokkos-kernels/pull/2113) +- Fix build error in trsv on gcc8 [\#2111](https://github.com/kokkos/kokkos-kernels/pull/2111) +- Add a workaround for compilation errors with cuda-12.2.0 + gcc-12.3 [\#2108](https://github.com/kokkos/kokkos-kernels/pull/2108) +- Increase tolerance on gesv test (Fix #2123) [\#2124](https://github.com/kokkos/kokkos-kernels/pull/2124) +- Fix usage of RAII to set cusparse/rocsparse stream [\#2141](https://github.com/kokkos/kokkos-kernels/pull/2141) +- Spmv bsr matrix fix missing matrix descriptor (rocsparse) [\#2138](https://github.com/kokkos/kokkos-kernels/pull/2138) + ## [4.2.01](https://github.com/kokkos/kokkos-kernels/tree/4.2.01) (2024-01-17) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.2.00...4.2.01) @@ -403,6 +497,7 @@ ### Bug Fixes: +- Use CRS matrix sort, instead of Kokkos::sort on each row [\#1553](https://github.com/kokkos/kokkos-kernels/pull/1553) - Change template type for StaticCrsGraph in BsrMatrix [\#1531](https://github.com/kokkos/kokkos-kernels/pull/1531) - Remove listing of undefined TPL deps [\#1568](https://github.com/kokkos/kokkos-kernels/pull/1568) - Fix using SpGEMM with nonstandard scalar type, with MKL enabled [\#1591](https://github.com/kokkos/kokkos-kernels/pull/1591) @@ -544,7 +639,7 @@ ## [3.6.00](https://github.com/kokkos/kokkos-kernels/tree/3.6.00) (2022-02-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.5.00...3.6.00) -### Features: +### Features: #### Batched Sparse Linear algebra - Kokkos Kernels is adding a new component to the library: batched sparse linear algebra. @@ -578,7 +673,7 @@ - SpMV: adding support for rocSPARSE TPL [\#1221](https://github.com/kokkos/kokkos-kernels/pull/1221) #### Additional new features -- bhalf: Unit test Batched GEMM [\#1251](https://github.com/kokkos/kokkos-kernels/pull/1251) +- bhalf: Unit test Batched GEMM [\#1251](https://github.com/kokkos/kokkos-kernels/pull/1251) - and demostrate GMRES example convergence with bhalf_t (https://github.com/kokkos/kokkos-kernels/pull/1300) - Stream interface: adding stream support in GEMV and GEMM [\#1131](https://github.com/kokkos/kokkos-kernels/pull/1131) - Improve double buffering batched gemm performance [\#1217](https://github.com/kokkos/kokkos-kernels/pull/1217) @@ -864,6 +959,13 @@ - Nightly test failure: spgemm unit tests failing on White \(Power8\) [\#780](https://github.com/kokkos/kokkos-kernels/issues/780) - supernodal does not build with UVM enabled [\#633](https://github.com/kokkos/kokkos-kernels/issues/633) +## [3.1.01](https://github.com/kokkos/kokkos-kernels/tree/3.1.01) (2020-05-04) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.00...3.1.01) + +** Fixed bugs:** + +- KokkosBatched QR PR breaking nightly tests [\#691](https://github.com/kokkos/kokkos-kernels/issues/691) + ## [3.1.00](https://github.com/kokkos/kokkos-kernels/tree/3.1.00) (2020-04-14) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.0.00...3.1.00) From 5bf5474dcc02d7c9cd25e9c9adb377c7c62a49fc Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 5 Apr 2024 13:06:34 -0600 Subject: [PATCH 216/326] KokkosLapack_svd_tpl_spec_decl: defer to MKL spec when LAPACK also enabled Resolves redefintion of struct SVD compilation errors with both MKL and LAPACK are enabled Reported by @maartenarnst in https://github.com/trilinos/Trilinos/issues/12891 Co-authored-by: brian-kelley --- lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp index bc23068c57..4385fa40d6 100644 --- a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp @@ -41,7 +41,8 @@ inline void svd_print_specialization() { } // namespace KokkosLapack // LAPACK -#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) #include "KokkosLapack_Host_tpl.hpp" namespace KokkosLapack { From 2abaf6d527e1f88cf94e857d2a3b07dfd30dff67 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 8 Apr 2024 07:39:23 -0600 Subject: [PATCH 217/326] .github/workflows: Added bdw.yml --- .github/workflows/bdw.yml | 353 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 .github/workflows/bdw.yml diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml new file mode 100644 index 0000000000..10c30c118d --- /dev/null +++ b/.github/workflows/bdw.yml @@ -0,0 +1,353 @@ +name: github-BDW + +# Only allow manual runs until at2 runners are available. +on: workflow_dispatch + #pull_request: + # paths-ignore: + # - '**/*.rst' + # - '**/*.md' + # - '**/requirements.txt' + # - '**/*.py' + # - 'docs/**' + # types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI: + name: PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI + runs-on: [kk-env-gcc-10.2.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_CXX_FLAGS=-O3 \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ARCH_BDW=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DCMAKE_CXX_COMPILER=g++ \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 " \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_TEST_ETI_ONLY=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + PR_BDW_GNU1020_THREADS_SERIAL_RIGHT_REL: + name: PR_BDW_GNU1020_THREADS_SERIAL_RIGHT_REL + runs-on: [kk-env-gcc-10.2.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_CXX_FLAGS=-O3 \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_SERIAL=ON \ + -DKokkos_ENABLE_THREADS=ON \ + -DKokkos_ARCH_BDW=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DCMAKE_CXX_COMPILER=g++ \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 " \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL: + name: PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL + runs-on: [kk-env-openblas-0.3.21-gcc-10.2.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + + cmake \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_CXX_FLAGS=-O3 \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_SERIAL=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ARCH_BDW=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + + cmake \ + -DCMAKE_CXX_COMPILER=g++ \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 " \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_BLAS=ON \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL: + name: PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL + runs-on: [kk-env-llvm-10.0.1-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_SERIAL=ON \ + -DKokkos_ENABLE_THREADS=ON \ + -DKokkos_ARCH_BDW=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + \ No newline at end of file From 0cadec78e6b1959bc1ee29c6db5b343b4287ec22 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 8 Apr 2024 07:40:22 -0600 Subject: [PATCH 218/326] .github/workflows: Added spr.yml --- .github/workflows/spr.yml | 60 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/workflows/spr.yml diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml new file mode 100644 index 0000000000..efb585ef02 --- /dev/null +++ b/.github/workflows/spr.yml @@ -0,0 +1,60 @@ +name: github-SPR + +# Only allow manual runs until at2 runners are available. +on: workflow_dispatch + #pull_request: + # paths-ignore: + # - '**/*.rst' + # - '**/*.md' + # - '**/requirements.txt' + # - '**/*.py' + # - 'docs/**' + # types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + PR_SPR_ONEAPI202310_OPENMP_LEFT_MKLBLAS_MKLLAPACK_REL: + name: PR_SPR_ONEAPI202310_OPENMP_LEFT_MKLBLAS_MKLLAPACK_REL + runs-on: [kk-env-intel-oneapi-compilers-2023.1.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v4 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v4 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure + run: | + mkdir -p build + cd build + ../kokkos-kernels/cm_generate_makefile.bash \ + --with-openmp \ + --with-serial \ + --arch=SPR \ + --compiler=icpx \ + --cxxflags="-fp-model=precise" \ + --with-tpls=mkl \ + --kokkos-cmake-flags=-DKokkos_ENABLE_ONEDPL=OFF \ + --kokkos-path=$PWD/../kokkos + + - name: build + working-directory: build + run: make -j16 + + - name: test + working-directory: build + run: ctest --output-on-failure -V --timeout 3600 \ No newline at end of file From db41fd7f2107e95a8a0a24cec471df0f93f96da5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 8 Apr 2024 07:40:57 -0600 Subject: [PATCH 219/326] .github/workflows: Added mi210.yml --- .github/workflows/mi210.yml | 103 ++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 .github/workflows/mi210.yml diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml new file mode 100644 index 0000000000..d2c3a16e1c --- /dev/null +++ b/.github/workflows/mi210.yml @@ -0,0 +1,103 @@ +name: github-MI210 + +# Only allow manual runs until at2 runners are available. +on: workflow_dispatch + #pull_request: + # paths-ignore: + # - '**/*.rst' + # - '**/*.md' + # - '**/requirements.txt' + # - '**/*.py' + # - 'docs/**' + # types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL: + name: PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL + runs-on: [kk-env-hip-5.6.1-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v4 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v4 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + HIPCC=$(which hipcc) + cmake -DCMAKE_CXX_COMPILER=$HIPCC \ + -DCMAKE_CXX_FLAGS=-O3 \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_SERIAL=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ARCH_VEGA90A=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j16 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + HIPCC=$(which hipcc) + cmake -DCMAKE_CXX_COMPILER=$HIPCC \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 " \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build + working-directory: kokkos-kernels/build + run: make -j16 + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 \ No newline at end of file From f302dd7a728ccf75614b5b13eb5ed4a3767fb5dd Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 8 Apr 2024 07:41:51 -0600 Subject: [PATCH 220/326] .github/workflows: Added h100.yml --- .github/workflows/h100.yml | 96 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 .github/workflows/h100.yml diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml new file mode 100644 index 0000000000..279b6089fa --- /dev/null +++ b/.github/workflows/h100.yml @@ -0,0 +1,96 @@ +name: github-H100 + +# Only allow manual runs until at2 runners are available. +on: workflow_dispatch + #pull_request: + # paths-ignore: + # - '**/*.rst' + # - '**/*.md' + # - '**/requirements.txt' + # - '**/*.py' + # - 'docs/**' + # types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL_2: + name: PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL_2 + runs-on: [kk-env-cuda-11.8.0-gcc-11.3.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + nvidia-smi + cd kokkos/build + cmake -DCMAKE_CXX_COMPILER=$PWD/../bin/nvcc_wrapper \ + -DCMAKE_CXX_FLAGS= \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ARCH_HOPPER90=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DCMAKE_CXX_COMPILER=$PWD/../../kokkos/bin/nvcc_wrapper \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="" \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT=ON \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 \ No newline at end of file From c9ab634b29cf8c366fc41dbabeac43fc972302c5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 8 Apr 2024 07:42:38 -0600 Subject: [PATCH 221/326] .github/workflows: Added volta70.yml --- .github/workflows/volta70.yml | 180 ++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 .github/workflows/volta70.yml diff --git a/.github/workflows/volta70.yml b/.github/workflows/volta70.yml new file mode 100644 index 0000000000..342121781e --- /dev/null +++ b/.github/workflows/volta70.yml @@ -0,0 +1,180 @@ +name: github-VOLTA70 + +# Only allow manual runs until at2 runners are available. +on: workflow_dispatch + #pull_request: + # paths-ignore: + # - '**/*.rst' + # - '**/*.md' + # - '**/requirements.txt' + # - '**/*.py' + # - 'docs/**' + # types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + PR_VOLTA70_CUDA1122_CUDA_LEFT_RIGHT_REL: + name: PR_VOLTA70_CUDA1122_CUDA_LEFT_RIGHT_REL + runs-on: [kk-env-cuda-11.2.2-gcc-8.4.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake -DCMAKE_CXX_COMPILER=$PWD/../bin/nvcc_wrapper \ + -DCMAKE_CXX_FLAGS= \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ARCH_HOPPER90=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DCMAKE_CXX_COMPILER=$PWD/../../kokkos/bin/nvcc_wrapper \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="" \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT=ON \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_OPENLAPACK_REL: + name: PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_OPENLAPACK_REL + runs-on: [kk-env-cuda-10.1.243-openblas-0.3.20-llvm-13.0.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + NVCC=$(which clang++) + cmake -DCMAKE_CXX_COMPILER=$NVCC \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ARCH_POWER9=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + NVCC=$(which clang++) + cmake \ + -DCMAKE_CXX_COMPILER=$NVCC \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_BLAS=ON \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=ON \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=ON \ + -DCMAKE_EXE_LINKER_FLAGS="-lgfortran -lm" \ + -DLAPACK_LIBRARIES=lapack \ + -DBLAS_LIBRARIES=blas \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 \ No newline at end of file From 9baac22ffb80960755b5dbd5879ba19435b7bd78 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 8 Apr 2024 09:16:35 -0600 Subject: [PATCH 222/326] .github/workflows: Added power9.yml and power9_tpls.yml --- .github/workflows/power9.yml | 118 ++++++++++++++++++++++++++++++ .github/workflows/power9_tpls.yml | 106 +++++++++++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 .github/workflows/power9.yml create mode 100644 .github/workflows/power9_tpls.yml diff --git a/.github/workflows/power9.yml b/.github/workflows/power9.yml new file mode 100644 index 0000000000..57cbf751bd --- /dev/null +++ b/.github/workflows/power9.yml @@ -0,0 +1,118 @@ +name: github-POWER9 + +# Only allow manual runs until at2 runners are available. +on: workflow_dispatch + #pull_request: + # paths-ignore: + # - '**/*.rst' + # - '**/*.md' + # - '**/requirements.txt' + # - '**/*.py' + # - 'docs/**' + # types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_REL: + name: PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_REL + runs-on: [kk-env-gcc-9.3.0-latest] + + strategy: + matrix: + include: + - backend: "SERIAL" + serial: "ON" + openmp: "OFF" + - backend: "OPENMP" + serial: "OFF" + openmp: "ON" + - backend: "SERIAL_OPENMP" + serial: "ON" + openmp: "ON" + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build-${{ matrix.backend }},install-${{ matrix.backend }}} + cd kokkos/build-${{ matrix.backend }} + GPP=$(which g++) + cmake -DCMAKE_CXX_COMPILER=$GPP \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install-${{ matrix.backend }} \ + -DKokkos_ENABLE_SERIAL=${{ matrix.serial }} \ + -DKokkos_ENABLE_OPENMP=${{ matrix.openmp }} \ + -DKokkos_ARCH_PASCAL60=ON \ + -DKokkos_ARCH_POWER8=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install}-${{ matrix.backend }} + cd kokkos-kernels/build-${{ matrix.backend }} + GPP=$(which g++) + cmake \ + -DCMAKE_CXX_COMPILER=$GPP \ + -DKokkos_DIR=$PWD/../../kokkos/install-${{ matrix.backend }}/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_BLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 diff --git a/.github/workflows/power9_tpls.yml b/.github/workflows/power9_tpls.yml new file mode 100644 index 0000000000..3915af7bdf --- /dev/null +++ b/.github/workflows/power9_tpls.yml @@ -0,0 +1,106 @@ +name: github-POWER9-TPLS + +# Only allow manual runs until at2 runners are available. +on: workflow_dispatch + #pull_request: + # paths-ignore: + # - '**/*.rst' + # - '**/*.md' + # - '**/requirements.txt' + # - '**/*.py' + # - 'docs/**' + # types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL: + name: PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL + runs-on: [kk-env-openblas-0.3.20-gcc-9.3.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + GPP=$(which g++) + cmake -DCMAKE_CXX_COMPILER=$GPP \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ARCH_POWER9=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + GPP=$(which g++) + cmake \ + -DCMAKE_CXX_COMPILER=$GPP \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_BLAS=ON \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="-lgfortran -lm" \ + -DLAPACK_LIBRARIES=lapack \ + -DBLAS_LIBRARIES=blas \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 \ No newline at end of file From 4f3220c911c0a3ef15fe22c7e2949b9f5f0ee720 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 9 Apr 2024 14:10:35 -0600 Subject: [PATCH 223/326] Jgfouca/block spiluk fixes (#2172) * Progress * Attempt 1, fix multiplication order * Converges in 1 step * Various cleanups * Be sure not to reduce performance of unblocked impl Also add some comments. * Remove test mangling * Fixes for GPU * Fix warning * formatting * Increase eps for floats * This is no longer needed --- .../impl/KokkosBatched_Trsm_Team_Impl.hpp | 46 +++- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 260 ++++++++++++++---- sparse/unit_test/Test_Sparse_spiluk.hpp | 12 +- 3 files changed, 248 insertions(+), 70 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index a7430775ea..9f5f857e44 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -99,6 +99,48 @@ struct TeamTrsm +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftUpper::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + +template +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftUpper::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + +/// +/// R/U/T +/// +/// B := (alpha*B) inv(triu(A)) +/// A(n x n), B(m x n) + template struct TeamTrsm { @@ -107,7 +149,7 @@ struct TeamTrsm::invoke( + return TeamTrsmInternalLeftUpper::invoke( member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); @@ -122,7 +164,7 @@ struct TeamTrsm::invoke( + return TeamTrsmInternalLeftUpper::invoke( member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index b3b5dfa277..415ccf87a0 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -32,6 +32,9 @@ #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBlas1_set.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_Trmm_Decl.hpp" +#include "KokkosBatched_Trmm_Serial_Impl.hpp" //#define NUMERIC_OUTPUT_INFO @@ -107,6 +110,17 @@ struct IlukWrap { lno_t lev_start; using reftype = scalar_t &; + using valtype = scalar_t; + + static constexpr size_type BUFF_SIZE = 1; + + struct SBlock { + template + KOKKOS_INLINE_FUNCTION SBlock(T, size_type, size_type) {} + + KOKKOS_INLINE_FUNCTION + scalar_t *data() { return nullptr; } + }; Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, @@ -131,6 +145,9 @@ struct IlukWrap { "Tried to use blocks with the unblocked Common?"); } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return 0; } + // lset KOKKOS_INLINE_FUNCTION void lset(const size_type nnz, const scalar_t &value) const { @@ -154,12 +171,18 @@ struct IlukWrap { // divide. lhs /= rhs KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, scalar_t &lhs, - const scalar_t &rhs) const { + void divide(const member_type &team, scalar_t &lhs, const scalar_t &rhs, + scalar_t *) const { Kokkos::single(Kokkos::PerTeam(team), [&]() { lhs /= rhs; }); team.team_barrier(); } + // divide_left. lhs /= rhs + KOKKOS_INLINE_FUNCTION + void divide_left(scalar_t &lhs, const scalar_t &rhs, scalar_t *) const { + lhs /= rhs; + } + // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION void multiply_subtract(const scalar_t &A, const scalar_t &B, @@ -171,6 +194,18 @@ struct IlukWrap { KOKKOS_INLINE_FUNCTION scalar_t &lget(const size_type nnz) const { return L_values(nnz); } + // lcopy + KOKKOS_INLINE_FUNCTION + scalar_t lcopy(const size_type nnz, scalar_t *) const { + return L_values(nnz); + } + + // ucopy + KOKKOS_INLINE_FUNCTION + scalar_t ucopy(const size_type nnz, scalar_t *) const { + return U_values(nnz); + } + // uget KOKKOS_INLINE_FUNCTION scalar_t &uget(const size_type nnz) const { return U_values(nnz); } @@ -188,6 +223,12 @@ struct IlukWrap { // print KOKKOS_INLINE_FUNCTION void print(const scalar_t &item) const { std::cout << item << std::endl; } + + // report + KOKKOS_INLINE_FUNCTION + void report() const { + std::cout << "JGF using unblocked version" << std::endl; + } }; // Partial specialization for block support @@ -197,6 +238,30 @@ struct IlukWrap { struct Common { + // BSR data is in LayoutRight! + using Layout = Kokkos::LayoutRight; + using value_type = typename LValuesType::value_type; + using cvalue_type = typename LValuesType::const_value_type; + + using Block = Kokkos::View< + value_type **, Layout, typename LValuesType::device_type, + Kokkos::MemoryTraits >; + + // const block + using CBlock = Kokkos::View< + cvalue_type **, Layout, typename UValuesType::device_type, + Kokkos::MemoryTraits >; + + // scratch block + using SBlock = Kokkos::View< + value_type **, Layout, typename execution_space::scratch_memory_space, + Kokkos::MemoryTraits >; + + using reftype = Block; + using valtype = Block; + + static constexpr size_type BUFF_SIZE = 128; + ARowMapType A_row_map; AEntriesType A_entries; AValuesType A_values; @@ -212,26 +277,6 @@ struct IlukWrap { size_type block_size; size_type block_items; - // BSR data is in LayoutRight! - using Layout = Kokkos::LayoutRight; - - using LBlock = Kokkos::View< - typename LValuesType::value_type **, Layout, - typename LValuesType::device_type, - Kokkos::MemoryTraits >; - - using UBlock = Kokkos::View< - typename UValuesType::value_type **, Layout, - typename UValuesType::device_type, - Kokkos::MemoryTraits >; - - using ABlock = Kokkos::View< - typename AValuesType::value_type **, Layout, - typename AValuesType::device_type, - Kokkos::MemoryTraits >; - - using reftype = LBlock; - Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, @@ -255,8 +300,12 @@ struct IlukWrap { block_items(block_size * block_size) { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); + KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + // lset KOKKOS_INLINE_FUNCTION void lset(const size_type block, const scalar_t &value) const { @@ -264,13 +313,9 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void lset(const size_type block, const ABlock &rhs) const { + void lset(const size_type block, const CBlock &rhs) const { auto lblock = lget(block); - for (size_type i = 0; i < block_size; ++i) { - for (size_type j = 0; j < block_size; ++j) { - lblock(i, j) = rhs(i, j); - } - } + assign(lblock, rhs); } // uset @@ -280,13 +325,9 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void uset(const size_type block, const ABlock &rhs) const { + void uset(const size_type block, const CBlock &rhs) const { auto ublock = uget(block); - for (size_type i = 0; i < block_size; ++i) { - for (size_type j = 0; j < block_size; ++j) { - ublock(i, j) = rhs(i, j); - } - } + assign(ublock, rhs); } // lset_id @@ -295,49 +336,111 @@ struct IlukWrap { KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); } - // divide. lhs /= rhs + // assign + template + KOKKOS_INLINE_FUNCTION void assign(const ViewT &lhs, + const CBlock &rhs) const { + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + lhs(i, j) = rhs(i, j); + } + } + } + + // divide. lhs /= rhs (lhs = lhs * rhs^-1) KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, const LBlock &lhs, - const UBlock &rhs) const { + void divide(const member_type &team, const Block &lhs, const CBlock &rhs, + scalar_t *buff) const { + // Need a temp block to do LU of rhs + Block LU(buff, block_size, block_size); + assign(LU, rhs); + KokkosBatched::TeamLU::invoke(team, LU); + + // rhs = LU + // rhs^-1 = U^-1 * L^-1 + // lhs = (lhs * U^-1) * L^-1, so do U trsm first KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, // not 100% on this - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>:: // not 100% on this - invoke(team, 1.0, rhs, lhs); + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + } + + // divide_left. lhs /= rhs (lhs = rhs^-1 * lhs) + KOKKOS_INLINE_FUNCTION + void divide_left(const Block &lhs, const CBlock &rhs, + scalar_t *buff) const { + Block LU(buff, block_size, block_size); + assign(LU, rhs); + KokkosBatched::SerialLU::invoke(LU); + + // rhs = LU + // rhs^-1 = U^-1 * L^-1 + // lhs = U^-1 * (L^-1 * lhs), so do L trsm first + KokkosBatched::SerialTrsm< + KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); + + KokkosBatched::SerialTrsm< + KokkosBatched::Side::Left, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); } // multiply_subtract. C -= A * B - template - KOKKOS_INLINE_FUNCTION void multiply_subtract(const UBlock &A, - const LBlock &B, - CView &C) const { + KOKKOS_INLINE_FUNCTION + void multiply_subtract(const CBlock &A, const CBlock &B, + const Block &C) const { // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 KokkosBatched::SerialGemm< KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Unblocked>::invoke( - -1.0, A, B, 1.0, C); + KokkosBatched::Algo::Gemm::Blocked>::invoke(-1.0, A, B, 1.0, + C); } // lget KOKKOS_INLINE_FUNCTION - LBlock lget(const size_type block) const { - return LBlock(L_values.data() + (block * block_items), block_size, - block_size); + Block lget(const size_type block) const { + return Block(L_values.data() + (block * block_items), block_size, + block_size); + } + + // lcopy + KOKKOS_INLINE_FUNCTION + Block lcopy(const size_type block, scalar_t *buff) const { + Block result(buff, block_size, block_size); + auto lblock = lget(block); + assign(result, lblock); + return result; + } + + // ucopy + KOKKOS_INLINE_FUNCTION + Block ucopy(const size_type block, scalar_t *buff) const { + Block result(buff, block_size, block_size); + auto ublock = uget(block); + assign(result, ublock); + return result; } // uget KOKKOS_INLINE_FUNCTION - UBlock uget(const size_type block) const { - return UBlock(U_values.data() + (block * block_items), block_size, - block_size); + Block uget(const size_type block) const { + return Block(U_values.data() + (block * block_items), block_size, + block_size); } // aget KOKKOS_INLINE_FUNCTION - ABlock aget(const size_type block) const { - return ABlock(A_values.data() + (block * block_items), block_size, + CBlock aget(const size_type block) const { + return CBlock(A_values.data() + (block * block_items), block_size, block_size); } @@ -357,7 +460,7 @@ struct IlukWrap { // print KOKKOS_INLINE_FUNCTION - void print(const LBlock &item) const { + void print(const CBlock &item) const { for (size_type i = 0; i < block_size; ++i) { std::cout << " "; for (size_type j = 0; j < block_size; ++j) { @@ -366,6 +469,13 @@ struct IlukWrap { std::cout << std::endl; } } + + // report + KOKKOS_INLINE_FUNCTION + void report() const { + std::cout << "JGF using blocked version with block_size=" << block_size + << std::endl; + } }; template struct SpilukTest { @@ -130,6 +128,7 @@ struct SpilukTest { using EntriesType = Kokkos::View; using ValuesType = Kokkos::View; using AT = Kokkos::ArithTraits; + using mag_t = typename Kokkos::ArithTraits::mag_type; using RowMapType_hostmirror = typename RowMapType::HostMirror; using EntriesType_hostmirror = typename EntriesType::HostMirror; @@ -138,6 +137,9 @@ struct SpilukTest { using memory_space = typename device::memory_space; using range_policy = Kokkos::RangePolicy; + static constexpr double EPS = + std::is_same::value ? 1e-7 : 1e-4; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; @@ -243,11 +245,7 @@ struct SpilukTest { } if (fill_lev > 1) { - if (UseBlocks) { - EXPECT_LT(result, 1e-2); - } else { - EXPECT_LT(result, 1e-4); - } + EXPECT_LT(result, 1e-4); } } From a86c780243b5f53597d3e44cd3f27e10cb643830 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Apr 2024 15:41:31 -0600 Subject: [PATCH 224/326] .github/workflows: Add PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL --- .github/workflows/mi210.yml | 85 ++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index d2c3a16e1c..60a7549814 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -96,7 +96,90 @@ jobs: - name: build working-directory: kokkos-kernels/build - run: make -j16 + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL: + name: PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL + runs-on: [kk-env-openblas-0.3.23-hip-5.6.1-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v4 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v4 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + HIPCC=$(which hipcc) + cmake -DCMAKE_CXX_COMPILER=$HIPCC \ + -DCMAKE_CXX_FLAGS=-O3 \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_SERIAL=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ARCH_VEGA90A=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j16 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + HIPCC=$(which hipcc) + cmake -DCMAKE_CXX_COMPILER=$HIPCC \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 " \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCSOLVER=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=ON \ + -DKokkosKernels_ENABLE_TPL_BLAS=ON \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. + + - name: build + working-directory: kokkos-kernels/build + run: make -j12 all - name: test working-directory: kokkos-kernels/build From 53ac9d49ce5c8ca22017a6ea69fcbe142df5ba79 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 12 Apr 2024 13:32:57 -0600 Subject: [PATCH 225/326] Add guard for cusparse spmv_mv_tpl_spec_avail Address issue #2175 Configuring with magma tpl enabled and cusparse disabled mistakenly triggers the cusparse tpl avail check to be true Guard the KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE macros when CUSPARSE is enabled to prevent this --- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 88fef4421a..44a8098ca3 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -29,6 +29,7 @@ struct spmv_mv_tpl_spec_avail { enum : bool { value = false }; }; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ XL, YL, MEMSPACE) \ template <> \ @@ -152,6 +153,7 @@ KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, #endif #endif // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) +#endif } // namespace Impl } // namespace KokkosSparse From 8029cc5739bfe46682f43b377a09c32b7a78f592 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 15 Apr 2024 10:18:10 -0600 Subject: [PATCH 226/326] .github/workflows: Remove OPENLAPACK from names --- .github/workflows/bdw.yml | 4 ++-- .github/workflows/mi210.yml | 4 ++-- .github/workflows/power9_tpls.yml | 4 ++-- .github/workflows/volta70.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index 10c30c118d..f5f4f9700e 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -183,8 +183,8 @@ jobs: working-directory: kokkos-kernels/build run: ctest --output-on-failure -V --timeout 3600 - PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL: - name: PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL + PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_REL: + name: PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_REL runs-on: [kk-env-openblas-0.3.21-gcc-10.2.0-latest] steps: diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index 60a7549814..e1f28f7ece 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -102,8 +102,8 @@ jobs: working-directory: kokkos-kernels/build run: ctest --output-on-failure -V --timeout 3600 - PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL: - name: PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL + PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_REL: + name: PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_REL runs-on: [kk-env-openblas-0.3.23-hip-5.6.1-latest] steps: diff --git a/.github/workflows/power9_tpls.yml b/.github/workflows/power9_tpls.yml index 3915af7bdf..73d6d0c57a 100644 --- a/.github/workflows/power9_tpls.yml +++ b/.github/workflows/power9_tpls.yml @@ -20,8 +20,8 @@ concurrency: cancel-in-progress: true jobs: - PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL: - name: PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL + PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_REL: + name: PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_REL runs-on: [kk-env-openblas-0.3.20-gcc-9.3.0-latest] steps: diff --git a/.github/workflows/volta70.yml b/.github/workflows/volta70.yml index 342121781e..742f1bf2f6 100644 --- a/.github/workflows/volta70.yml +++ b/.github/workflows/volta70.yml @@ -94,8 +94,8 @@ jobs: working-directory: kokkos-kernels/build run: ctest --output-on-failure -V --timeout 3600 - PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_OPENLAPACK_REL: - name: PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_OPENLAPACK_REL + PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_REL: + name: PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_REL runs-on: [kk-env-cuda-10.1.243-openblas-0.3.20-llvm-13.0.0-latest] steps: From 75ee9a6ff0259f80e9b19280c5bc26db6d84c618 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Apr 2024 09:34:09 -0600 Subject: [PATCH 227/326] .github/workflows: Remove power9 until we have hardware to test it on --- .github/workflows/power9.yml | 118 ------------------------------ .github/workflows/power9_tpls.yml | 106 --------------------------- 2 files changed, 224 deletions(-) delete mode 100644 .github/workflows/power9.yml delete mode 100644 .github/workflows/power9_tpls.yml diff --git a/.github/workflows/power9.yml b/.github/workflows/power9.yml deleted file mode 100644 index 57cbf751bd..0000000000 --- a/.github/workflows/power9.yml +++ /dev/null @@ -1,118 +0,0 @@ -name: github-POWER9 - -# Only allow manual runs until at2 runners are available. -on: workflow_dispatch - #pull_request: - # paths-ignore: - # - '**/*.rst' - # - '**/*.md' - # - '**/requirements.txt' - # - '**/*.py' - # - 'docs/**' - # types: [ opened, reopened, synchronize ] - -permissions: - contents: none - -# Cancels any in progress 'workflow' associated with this PR -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_REL: - name: PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_REL - runs-on: [kk-env-gcc-9.3.0-latest] - - strategy: - matrix: - include: - - backend: "SERIAL" - serial: "ON" - openmp: "OFF" - - backend: "OPENMP" - serial: "OFF" - openmp: "ON" - - backend: "SERIAL_OPENMP" - serial: "ON" - openmp: "ON" - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v3 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v3 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build-${{ matrix.backend }},install-${{ matrix.backend }}} - cd kokkos/build-${{ matrix.backend }} - GPP=$(which g++) - cmake -DCMAKE_CXX_COMPILER=$GPP \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install-${{ matrix.backend }} \ - -DKokkos_ENABLE_SERIAL=${{ matrix.serial }} \ - -DKokkos_ENABLE_OPENMP=${{ matrix.openmp }} \ - -DKokkos_ARCH_PASCAL60=ON \ - -DKokkos_ARCH_POWER8=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j12 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install}-${{ matrix.backend }} - cd kokkos-kernels/build-${{ matrix.backend }} - GPP=$(which g++) - cmake \ - -DCMAKE_CXX_COMPILER=$GPP \ - -DKokkos_DIR=$PWD/../../kokkos/install-${{ matrix.backend }}/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_BLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build_kokkos_kernels - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 diff --git a/.github/workflows/power9_tpls.yml b/.github/workflows/power9_tpls.yml deleted file mode 100644 index 73d6d0c57a..0000000000 --- a/.github/workflows/power9_tpls.yml +++ /dev/null @@ -1,106 +0,0 @@ -name: github-POWER9-TPLS - -# Only allow manual runs until at2 runners are available. -on: workflow_dispatch - #pull_request: - # paths-ignore: - # - '**/*.rst' - # - '**/*.md' - # - '**/requirements.txt' - # - '**/*.py' - # - 'docs/**' - # types: [ opened, reopened, synchronize ] - -permissions: - contents: none - -# Cancels any in progress 'workflow' associated with this PR -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_REL: - name: PR_POWER9_GCC930_OPENMP_SERIAL_LEFT_OPENBLAS_REL - runs-on: [kk-env-openblas-0.3.20-gcc-9.3.0-latest] - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v3 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v3 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build,install} - cd kokkos/build - GPP=$(which g++) - cmake -DCMAKE_CXX_COMPILER=$GPP \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ARCH_VOLTA70=ON \ - -DKokkos_ARCH_POWER9=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j12 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install} - cd kokkos-kernels/build - GPP=$(which g++) - cmake \ - -DCMAKE_CXX_COMPILER=$GPP \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_BLAS=ON \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DCMAKE_EXE_LINKER_FLAGS="-lgfortran -lm" \ - -DLAPACK_LIBRARIES=lapack \ - -DBLAS_LIBRARIES=blas \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build_kokkos_kernels - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 \ No newline at end of file From c8e5b508f3ec3593d5ac0555ad8160c47fb78b14 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Apr 2024 11:31:41 -0600 Subject: [PATCH 228/326] .github/workflows: Enable rocblas in rocm tpl check --- .github/workflows/mi210.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index e1f28f7ece..a52e892ce7 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -170,6 +170,7 @@ jobs: -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ -DKokkosKernels_ENABLE_TPL_ROCSOLVER=ON \ -DKokkosKernels_ENABLE_TPL_ROCSPARSE=ON \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=ON \ -DKokkosKernels_ENABLE_TPL_BLAS=ON \ -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ -DCMAKE_EXE_LINKER_FLAGS="" \ From 8769084fad59806905068b07083332578ac91a5b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Apr 2024 11:32:30 -0600 Subject: [PATCH 229/326] .github/workflows: Remove volta70 until we have hardware to test it on --- .github/workflows/volta70.yml | 180 ---------------------------------- 1 file changed, 180 deletions(-) delete mode 100644 .github/workflows/volta70.yml diff --git a/.github/workflows/volta70.yml b/.github/workflows/volta70.yml deleted file mode 100644 index 742f1bf2f6..0000000000 --- a/.github/workflows/volta70.yml +++ /dev/null @@ -1,180 +0,0 @@ -name: github-VOLTA70 - -# Only allow manual runs until at2 runners are available. -on: workflow_dispatch - #pull_request: - # paths-ignore: - # - '**/*.rst' - # - '**/*.md' - # - '**/requirements.txt' - # - '**/*.py' - # - 'docs/**' - # types: [ opened, reopened, synchronize ] - -permissions: - contents: none - -# Cancels any in progress 'workflow' associated with this PR -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - PR_VOLTA70_CUDA1122_CUDA_LEFT_RIGHT_REL: - name: PR_VOLTA70_CUDA1122_CUDA_LEFT_RIGHT_REL - runs-on: [kk-env-cuda-11.2.2-gcc-8.4.0-latest] - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v3 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v3 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build,install} - cd kokkos/build - cmake -DCMAKE_CXX_COMPILER=$PWD/../bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS= \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ARCH_HOPPER90=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j12 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install} - cd kokkos-kernels/build - cmake \ - -DCMAKE_CXX_COMPILER=$PWD/../../kokkos/bin/nvcc_wrapper \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="" \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DKokkosKernels_INST_LAYOUTRIGHT=ON \ - -DCMAKE_EXE_LINKER_FLAGS="" \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build_kokkos_kernels - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 - - PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_REL: - name: PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_REL - runs-on: [kk-env-cuda-10.1.243-openblas-0.3.20-llvm-13.0.0-latest] - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v3 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v3 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build,install} - cd kokkos/build - NVCC=$(which clang++) - cmake -DCMAKE_CXX_COMPILER=$NVCC \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ARCH_VOLTA70=ON \ - -DKokkos_ARCH_POWER9=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j12 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install} - cd kokkos-kernels/build - NVCC=$(which clang++) - cmake \ - -DCMAKE_CXX_COMPILER=$NVCC \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_BLAS=ON \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=ON \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=ON \ - -DCMAKE_EXE_LINKER_FLAGS="-lgfortran -lm" \ - -DLAPACK_LIBRARIES=lapack \ - -DBLAS_LIBRARIES=blas \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build_kokkos_kernels - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 \ No newline at end of file From 35e115a847eae8b9a5a8e5baa9f1470755311db4 Mon Sep 17 00:00:00 2001 From: malphil Date: Wed, 17 Apr 2024 14:14:14 -0600 Subject: [PATCH 230/326] Add early return if numRows == 0 in trsv to avoid integer divide-by-zero error --- sparse/impl/KokkosSparse_trsv_impl.hpp | 48 +++++++++++++++++++------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index 9adb029d12..6d04a42877 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -184,7 +184,9 @@ struct TrsvWrap { static void lowerTriSolveCsrUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; const lno_t numVecs = X.extent(1); @@ -211,7 +213,9 @@ struct TrsvWrap { static void lowerTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; const lno_t numVecs = X.extent(1); @@ -254,7 +258,9 @@ struct TrsvWrap { static void upperTriSolveCsrUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; const lno_t numVecs = X.extent(1); @@ -304,7 +310,9 @@ struct TrsvWrap { static void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; const lno_t numVecs = X.extent(1); @@ -371,7 +379,9 @@ struct TrsvWrap { static void upperTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; @@ -422,7 +432,9 @@ struct TrsvWrap { static void upperTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; @@ -481,7 +493,9 @@ struct TrsvWrap { static void lowerTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; @@ -510,7 +524,9 @@ struct TrsvWrap { static void upperTriSolveCscUnitDiagConj(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; @@ -562,7 +578,9 @@ struct TrsvWrap { static void upperTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; @@ -620,7 +638,9 @@ struct TrsvWrap { static void lowerTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; @@ -657,7 +677,9 @@ struct TrsvWrap { static void lowerTriSolveCscUnitDiagConj(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; @@ -686,7 +708,9 @@ struct TrsvWrap { static void lowerTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { - const lno_t numRows = A.numRows(); + const lno_t numRows = A.numRows(); + if (numRows == 0) return; + const lno_t numCols = A.numCols(); const lno_t numPointRows = A.numPointRows(); const lno_t block_size = numPointRows / numRows; From 47a184940c5e878a95c894e7838425880798d48b Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 18 Apr 2024 11:07:15 -0600 Subject: [PATCH 231/326] Resolves multiple definition of Magma and Cuda singletons (#2178) Address issue #2175 --- blas/tpls/KokkosBlas_Cuda_tpl.cpp | 1 + blas/tpls/KokkosBlas_Cuda_tpl.hpp | 22 ---------- blas/tpls/KokkosBlas_Magma_tpl.hpp | 41 +++++++++++++++++++ blas/tpls/KokkosBlas_magma.hpp | 37 +++++++++++++++++ blas/tpls/KokkosBlas_tpl_spec.hpp | 17 -------- lapack/tpls/KokkosLapack_Cuda_tpl.cpp | 1 + lapack/tpls/KokkosLapack_Cuda_tpl.hpp | 22 ---------- lapack/tpls/KokkosLapack_Magma_tpl.hpp | 41 +++++++++++++++++++ .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 2 +- 9 files changed, 122 insertions(+), 62 deletions(-) create mode 100644 blas/tpls/KokkosBlas_Magma_tpl.hpp create mode 100644 blas/tpls/KokkosBlas_magma.hpp create mode 100644 lapack/tpls/KokkosLapack_Magma_tpl.hpp diff --git a/blas/tpls/KokkosBlas_Cuda_tpl.cpp b/blas/tpls/KokkosBlas_Cuda_tpl.cpp index eed90ef7e0..cb8ba34101 100644 --- a/blas/tpls/KokkosBlas_Cuda_tpl.cpp +++ b/blas/tpls/KokkosBlas_Cuda_tpl.cpp @@ -16,3 +16,4 @@ #include #include #include +#include diff --git a/blas/tpls/KokkosBlas_Cuda_tpl.hpp b/blas/tpls/KokkosBlas_Cuda_tpl.hpp index cf51341471..d85785316e 100644 --- a/blas/tpls/KokkosBlas_Cuda_tpl.hpp +++ b/blas/tpls/KokkosBlas_Cuda_tpl.hpp @@ -39,26 +39,4 @@ CudaBlasSingleton& CudaBlasSingleton::singleton() { } // namespace KokkosBlas #endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUBLAS) -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) -#include - -namespace KokkosBlas { -namespace Impl { - -MagmaSingleton::MagmaSingleton() { - magma_int_t stat = magma_init(); - if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); -} - -MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; - return s; -} - -} // namespace Impl -} // namespace KokkosBlas -#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) - #endif // KOKKOSBLAS_CUDA_TPL_HPP_ diff --git a/blas/tpls/KokkosBlas_Magma_tpl.hpp b/blas/tpls/KokkosBlas_Magma_tpl.hpp new file mode 100644 index 0000000000..f149a790df --- /dev/null +++ b/blas/tpls/KokkosBlas_Magma_tpl.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBLAS_MAGMA_TPL_HPP_ +#define KOKKOSBLAS_MAGMA_TPL_HPP_ + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#include + +namespace KokkosBlas { +namespace Impl { + +MagmaSingleton::MagmaSingleton() { + magma_int_t stat = magma_init(); + if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { magma_finalize(); }); +} + +MagmaSingleton& MagmaSingleton::singleton() { + static MagmaSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosBlas +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + +#endif // KOKKOSBLAS_MAGMA_TPL_HPP_ diff --git a/blas/tpls/KokkosBlas_magma.hpp b/blas/tpls/KokkosBlas_magma.hpp new file mode 100644 index 0000000000..5f5fcfe4e1 --- /dev/null +++ b/blas/tpls/KokkosBlas_magma.hpp @@ -0,0 +1,37 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS_MAGMA_HPP_ +#define KOKKOSBLAS_MAGMA_HPP_ + +// If LAPACK TPL is enabled, it is preferred over magma's LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "magma_v2.h" + +namespace KokkosBlas { +namespace Impl { + +struct MagmaSingleton { + MagmaSingleton(); + + static MagmaSingleton& singleton(); +}; + +} // namespace Impl +} // namespace KokkosBlas +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA + +#endif // KOKKOSBLAS_MAGMA_HPP_ diff --git a/blas/tpls/KokkosBlas_tpl_spec.hpp b/blas/tpls/KokkosBlas_tpl_spec.hpp index a1eee4b69c..0151c0534f 100644 --- a/blas/tpls/KokkosBlas_tpl_spec.hpp +++ b/blas/tpls/KokkosBlas_tpl_spec.hpp @@ -214,21 +214,4 @@ inline rocblas_operation trans_mode_kk_to_rocblas(const char kkMode[]) { #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -// If LAPACK TPL is enabled, it is preferred over magma's LAPACK -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include "magma_v2.h" - -namespace KokkosBlas { -namespace Impl { - -struct MagmaSingleton { - MagmaSingleton(); - - static MagmaSingleton& singleton(); -}; - -} // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA - #endif // KOKKOSBLAS_TPL_SPEC_HPP_ diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.cpp b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp index 2ac28871a4..7f87eed0d5 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp @@ -16,3 +16,4 @@ #include #include #include +#include diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 6749a4740f..943d10d111 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -39,26 +39,4 @@ CudaLapackSingleton& CudaLapackSingleton::singleton() { } // namespace KokkosLapack #endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) -#include - -namespace KokkosLapack { -namespace Impl { - -MagmaSingleton::MagmaSingleton() { - magma_int_t stat = magma_init(); - if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); -} - -MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; - return s; -} - -} // namespace Impl -} // namespace KokkosLapack -#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) - #endif // KOKKOSLAPACK_CUDA_TPL_HPP_ diff --git a/lapack/tpls/KokkosLapack_Magma_tpl.hpp b/lapack/tpls/KokkosLapack_Magma_tpl.hpp new file mode 100644 index 0000000000..636c40735d --- /dev/null +++ b/lapack/tpls/KokkosLapack_Magma_tpl.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_MAGMA_TPL_HPP_ +#define KOKKOSLAPACK_MAGMA_TPL_HPP_ + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#include + +namespace KokkosLapack { +namespace Impl { + +MagmaSingleton::MagmaSingleton() { + magma_int_t stat = magma_init(); + if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { magma_finalize(); }); +} + +MagmaSingleton& MagmaSingleton::singleton() { + static MagmaSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + +#endif // KOKKOSLAPACK_MAGMA_TPL_HPP_ diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 41592e079a..ca4b9e7abc 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -155,7 +155,7 @@ KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include +#include namespace KokkosLapack { namespace Impl { From c00555a373c40181dfdb619d4610439c4c098d09 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 23 Apr 2024 16:19:47 -0600 Subject: [PATCH 232/326] magma: fix linker errors for builds without cusolver (#2181) * magma: fix linker errors for builds without cusolver * BatchedGemm test: workaround testing cublas+magma - temporary workaround to skip magma test when cublas enabled to avoid issues like #2177 --- .../unit_test/Test_Batched_BatchedGemm.hpp | 6 +++++- lapack/CMakeLists.txt | 6 ++++++ lapack/tpls/KokkosLapack_Cuda_tpl.cpp | 1 - lapack/tpls/KokkosLapack_Magma_tpl.cpp | 18 ++++++++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 lapack/tpls/KokkosLapack_Magma_tpl.cpp diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index d57e671908..3c00b4f477 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -229,7 +229,11 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ASSERT_EQ(batchedGemmHandleCublas.vecLen, 0); #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + // FIXME temporary workaround to run this magma test only if cublas is not + // enabled the design of the BatchedGemmHandle currently does not allow + // simultanous testing in this way. See issue #2177 +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) magma_queue_t magma_queue; BatchedGemmHandle batchedGemmHandleMagma(magma_queue, GemmTplAlgos::MAGMA, 0, 0); diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index f825a2184a..804a2b7542 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -34,6 +34,12 @@ IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) ) ENDIF() +IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Magma_tpl.cpp + ) +ENDIF() + ################## # # # ETI generation # diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.cpp b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp index 7f87eed0d5..2ac28871a4 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp @@ -16,4 +16,3 @@ #include #include #include -#include diff --git a/lapack/tpls/KokkosLapack_Magma_tpl.cpp b/lapack/tpls/KokkosLapack_Magma_tpl.cpp new file mode 100644 index 0000000000..73add8d9e0 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Magma_tpl.cpp @@ -0,0 +1,18 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#include +#include +#include From 0a9b4e782ddfc501ad5d2bc03c42b006d89ebb09 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 Apr 2024 10:08:45 -0600 Subject: [PATCH 233/326] .github/workflows/mi210: Fix include paths --- .github/workflows/mi210.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index a52e892ce7..7f4518b244 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -153,7 +153,7 @@ jobs: HIPCC=$(which hipcc) cmake -DCMAKE_CXX_COMPILER=$HIPCC \ -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 " \ + -DCMAKE_CXX_FLAGS="-O3 -I$ROCM_CORE_ROOT/include" \ -DCMAKE_INSTALL_PREFIX= \ -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ -DKokkosKernels_ENABLE_TESTS=ON \ From 063cf77e069fa3e49edaa8742fcf5930eb7c67e4 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 24 Apr 2024 15:58:40 -0600 Subject: [PATCH 234/326] workflows/osx.yml: test against most recent kokkos tag - test against most recent kokkos release rather than develop branch, as done with AT CI, to avoid compatibility breakages --- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index a2b2d8c830..9f05579fa5 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -58,7 +58,7 @@ jobs: uses: actions/checkout@v4 with: repository: kokkos/kokkos - ref: develop + ref: 4.3.00 path: kokkos - name: configure_kokkos From bac0b1ac9a28c80f4fb486ae9f9c4a69066000b0 Mon Sep 17 00:00:00 2001 From: Malachi Date: Thu, 25 Apr 2024 09:57:59 -0500 Subject: [PATCH 235/326] Resolve vortex compilation issue by resolving (potentially) duplicate symbol (#2183) Stick to pattern of removing leading 'c' or 'z' in method name and relying on the template type Co-authored-by: malphil --- .../KokkosBlas2_syr2_tpl_spec_decl_blas.hpp | 4 ++-- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 4 ++-- blas/tpls/KokkosBlas_Host_tpl.cpp | 24 ++++++++++--------- blas/tpls/KokkosBlas_Host_tpl.hpp | 15 ++++-------- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp index 8561675c72..f22e800bc5 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp @@ -163,7 +163,7 @@ namespace Impl { ETI_SPEC_AVAIL>::syr2(space, trans, uplo, alpha, X, Y, A); \ } else { \ if (A_is_ll) { \ - HostBlas>::zher2( \ + HostBlas>::her2( \ uplo[0], N, alpha, \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one, \ @@ -220,7 +220,7 @@ namespace Impl { ETI_SPEC_AVAIL>::syr2(space, trans, uplo, alpha, X, Y, A); \ } else { \ if (A_is_ll) { \ - HostBlas>::cher2( \ + HostBlas>::her2( \ uplo[0], N, alpha, \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one, \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 6b64fce2bc..fc8fb949d7 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -139,7 +139,7 @@ namespace Impl { space, trans, uplo, alpha, X, A); \ } else { \ if (A_is_ll) { \ - HostBlas>::zher( \ + HostBlas>::her( \ uplo[0], N, alpha.real(), \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ @@ -188,7 +188,7 @@ namespace Impl { space, trans, uplo, alpha, X, A); \ } else { \ if (A_is_ll && (alpha.imag() == 0.)) { \ - HostBlas>::cher( \ + HostBlas>::her( \ uplo[0], N, alpha.real(), \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 50aab57c73..dc04ca7e67 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -295,10 +295,10 @@ void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); /// /// Syr2 @@ -322,12 +322,12 @@ void F77_BLAS_MANGLE(cher2, CHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); /// /// Trsv @@ -901,14 +901,14 @@ void HostBlas >::gerc( } template <> template <> -void HostBlas >::cher( +void HostBlas >::her( const char uplo, KK_INT n, const float alpha, const std::complex* x, KK_INT incx, std::complex* a, KK_INT lda) { F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::cher2( +void HostBlas >::her2( const char uplo, KK_INT n, const std::complex alpha, const std::complex* x, KK_INT incx, const std::complex* y, KK_INT incy, std::complex* a, KK_INT lda) { @@ -1069,15 +1069,17 @@ void HostBlas >::gerc( } template <> template <> -void HostBlas >::zher( - const char uplo, KK_INT n, const double alpha, - const std::complex* x, KK_INT incx, std::complex* a, - KK_INT lda) { +void HostBlas >::her(const char uplo, KK_INT n, + const double alpha, + const std::complex* x, + KK_INT incx, + std::complex* a, + KK_INT lda) { F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::zher2( +void HostBlas >::her2( const char uplo, KK_INT n, const std::complex alpha, const std::complex* x, KK_INT incx, const std::complex* y, KK_INT incy, std::complex* a, KK_INT lda) { diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 5fb7c1f624..d28f7a2186 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -90,18 +90,11 @@ struct HostBlas { KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); template - static void cher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); - - template - static void zher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); - - static void cher2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void her(const char uplo, KK_INT n, const tAlpha alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); - static void zher2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void her2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); static void trsv(const char uplo, const char transa, const char diag, KK_INT m, const T *a, KK_INT lda, From b87dc951f64463fe5d6d4d886c0496ee804b2715 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 25 Apr 2024 13:13:36 -0600 Subject: [PATCH 236/326] Changes to enable OneAPI usage with Trilinos build (#2185) - Get rid of SYCL_OVERRIDE setting - Add MKL_PROVIDES_BLAS_LAPACK cmake variable and define so that code knows whether to use int, or MKL_INT - Trilinos builds might link with OneAPI for GPU but standard BLAS/LAPACK on CPU --- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 3 +-- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 3 +-- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 4 +--- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 4 +--- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 3 +-- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 4 +--- blas/tpls/KokkosBlas_Host_tpl.hpp | 2 +- cmake/KokkosKernels_config.h.in | 4 +++- cmake/Modules/FindTPLBLAS.cmake | 1 - cmake/Modules/FindTPLLAPACK.cmake | 1 - cmake/Modules/FindTPLMKL.cmake | 4 ++++ cmake/kokkoskernels_tpls.cmake | 4 ---- sparse/src/KokkosSparse_spmv_handle.hpp | 3 +-- sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 3 +-- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 3 +-- 15 files changed, 17 insertions(+), 29 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index be0a45c7be..8f79c8a58d 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -116,8 +116,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, // oneMKL #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#if defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ template \ diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index c695eaee1e..12a240db6b 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -334,8 +334,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, // oneMKL #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#if defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #include #include diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index de930f6107..7bc55becc0 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -88,9 +88,7 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ - defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #endif diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 736523aa8d..ef45238405 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -364,9 +364,7 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ - defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 0820badd9a..661393e445 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -161,8 +161,7 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#if defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ template \ diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 2ace065808..304dd349bf 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -767,9 +767,7 @@ KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS // ONEMKL -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ - defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index d28f7a2186..f7fb3d3978 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -32,7 +32,7 @@ namespace KokkosBlas { namespace Impl { -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(MKL_PROVIDES_BLAS_LAPACK) using KK_INT = MKL_INT; #else using KK_INT = int; diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index ef8fea78b8..9f6a0b85d5 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -29,7 +29,6 @@ requires (a) header file(s) as well, and may use functions other than just BLAS and LAPACK functions. */ #cmakedefine HAVE_KOKKOSKERNELS_MKL -#cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE #cmakedefine KOKKOSKERNELS_ENABLE_BENCHMARK @@ -154,6 +153,9 @@ #endif #endif +/* Whether MKL is providing the BLAS and LAPACK implementation */ +#cmakedefine MKL_PROVIDES_BLAS_LAPACK + #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOSKERNELS_ENABLE_HOST_ONLY diff --git a/cmake/Modules/FindTPLBLAS.cmake b/cmake/Modules/FindTPLBLAS.cmake index 0bc73fc73f..67e4cc9a08 100644 --- a/cmake/Modules/FindTPLBLAS.cmake +++ b/cmake/Modules/FindTPLBLAS.cmake @@ -8,4 +8,3 @@ ELSE() FIND_PACKAGE(BLAS REQUIRED) KOKKOSKERNELS_CREATE_IMPORTED_TPL(BLAS INTERFACE LINK_LIBRARIES ${BLAS_LIBRARIES}) ENDIF() - diff --git a/cmake/Modules/FindTPLLAPACK.cmake b/cmake/Modules/FindTPLLAPACK.cmake index 463f61afeb..f6d345d5ee 100644 --- a/cmake/Modules/FindTPLLAPACK.cmake +++ b/cmake/Modules/FindTPLLAPACK.cmake @@ -8,4 +8,3 @@ ELSE() FIND_PACKAGE(LAPACK REQUIRED) KOKKOSKERNELS_CREATE_IMPORTED_TPL(LAPACK INTERFACE LINK_LIBRARIES ${LAPACK_LIBRARIES}) ENDIF() - diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 52f4571976..1ecd882e71 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -74,3 +74,7 @@ ELSE() ) ENDIF() ENDIF() +# This logic to find MKL is only used in non-Trilinos builds. +# In this case, MKL can always be used as the host BLAS/LAPACK implementation +# (whether MKL_INT is 32- or 64-bit). +set (MKL_PROVIDES_BLAS_LAPACK ON INTERNAL) diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index d1a44721e6..6af952ce94 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -31,10 +31,6 @@ MACRO(KOKKOSKERNELS_ADD_TPL_OPTION NAME DEFAULT_VALUE DOCSTRING) SET(ROOT_DEFAULT $ENV{${_NAME_ORIG}_ROOT}) KOKKOSKERNELS_ADD_OPTION(${_NAME_ORIG}_ROOT "${ROOT_DEFAULT}" PATH "Location of ${_NAME} install root. Default: None or the value of the environment variable ${_NAME}_ROOT if set") IF (DEFINED TPL_ENABLE_${_NAME}) - IF (${_NAME} STREQUAL MKL AND KOKKOSKERNELS_HAS_TRILINOS) - MESSAGE("Trilinos has enabled MKL and SYCL but it does not detect oneMKL correctly so we disable it!") - SET(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE ON) - ENDIF () IF (TPL_ENABLE_${_NAME} AND NOT KOKKOSKERNELS_ENABLE_TPL_${_NAME}) MESSAGE("Overriding KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG}=OFF with TPL_ENABLE_${_NAME}=ON") SET(KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG} ON) diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index a2eecfd1ce..b3e878b5e9 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -189,8 +189,7 @@ struct MKL_SpMV_Data : public TPL_SpMV_Data { }; #endif -#if defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) struct OneMKL_SpMV_Data : public TPL_SpMV_Data { OneMKL_SpMV_Data(const Kokkos::Experimental::SYCL& exec_) : TPL_SpMV_Data(exec_) {} diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 854c2f2b26..881352d950 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -257,8 +257,7 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif -#if defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 1555050420..66ea90c746 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -678,8 +678,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP) #undef KOKKOSSPARSE_SPMV_MKL #endif -#if defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { switch (toupper(mode_kk)) { case 'N': return oneapi::mkl::transpose::nontrans; From 88ae8f1489e390578485c9c274a2129d38da5301 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 2 May 2024 08:23:57 -0600 Subject: [PATCH 237/326] Fix macOS docs build (#2190) * Fix docs build * try docs fix * make sphinx available at config time --- .github/workflows/docs.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 04a1ba74b2..1646300e81 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -16,9 +16,11 @@ jobs: - name: Install Dependencies run: | brew install doxygen - python3 -m pip install sphinx -v "sphinx==6.2.1" - python3 -m pip install breathe - python3 -m pip install sphinx-rtd-theme + python3 -m venv .venv + . .venv/bin/activate + pip install sphinx -v "sphinx==6.2.1" + pip install breathe + pip install sphinx-rtd-theme sphinx-build --version doxygen --version @@ -52,8 +54,10 @@ jobs: working-directory: kokkos/build run: make -j2 install + # sphinx needs to be available at configure time for the target to be generated - name: configure_kokkos_kernels run: | + . .venv/bin/activate mkdir -p kokkos-kernels/{build,install} cd kokkos-kernels/build cmake \ @@ -81,5 +85,7 @@ jobs: fi - name: build_kokkos_kernels_sphinx - working-directory: kokkos-kernels/build - run: make Sphinx + run: | + . .venv/bin/activate + cd kokkos-kernels/build + make Sphinx From 2454f8d56dfe48bf56803c74c7aec5d2d5d63022 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 2 May 2024 16:37:34 -0600 Subject: [PATCH 238/326] GH-Actions: adding security actions and scorecard (#2192) Pretty much taking the new files from PR #2191 and re-creating it in a clean PR on top of develop with small changes relevant to our repository. Cleaning up some workflows to tailor it for our needs --- .github/dependabot.yml | 11 +++ .github/workflows/codeql.yml | 105 ++++++++++++++++++++++++ .github/workflows/dependency-review.yml | 27 ++++++ .github/workflows/scorecards.yml | 76 +++++++++++++++++ 4 files changed, 219 insertions(+) create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/dependency-review.yml create mode 100644 .github/workflows/scorecards.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..74999af711 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + + - package-ecosystem: pip + directory: /docs + schedule: + interval: weekly diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..079acb93bf --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,105 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: ["master", "develop", "release-*"] + pull_request: + # The branches below must be a subset of the branches above + branches: ["develop"] + schedule: + - cron: "0 8 * * 0" + +permissions: read-all + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + steps: + - name: Harden Runner + uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 + with: + egress-policy: audit + + - name: checkout_kokkos_kernels + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + with: + path: kokkos-kernels + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@ceaec5c11a131e0d282ff3b6f095917d234caace # v2.25.3 + with: + languages: c-cpp + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + - name: checkout_kokkos + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + with: + repository: 'kokkos/kokkos' + path: 'kokkos' + ref: '4.3.00' + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake -S ${{github.workspace}}/kokkos \ + -B ${{github.workspace}}/kokkos/build \ + -D Kokkos_ENABLE_SERIAL=ON \ + -D CMAKE_CXX_FLAGS="-Werror" \ + -D CMAKE_CXX_STANDARD=17 \ + -D Kokkos_ENABLE_COMPILER_WARNINGS=ON \ + -D Kokkos_ENABLE_TESTS=OFF \ + -D Kokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -D CMAKE_BUILD_TYPE=RELEASE \ + -D CMAKE_INSTALL_PREFIX=${{github.workspace}}/kokkos/install + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j2 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -S ${{github.workspace}}/kokkos-kernels \ + -DKokkos_ROOT=${{github.workspace}}/kokkos/install \ + -DCMAKE_BUILD_TYPE=RELEASE \ + -DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/kokkos-kernels/install \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_LAYOUTLEFT:BOOL=ON \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=OFF \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=OFF + + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@ceaec5c11a131e0d282ff3b6f095917d234caace # v2.25.3 + with: + category: "/language:c-cpp" diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml new file mode 100644 index 0000000000..d084a0f10a --- /dev/null +++ b/.github/workflows/dependency-review.yml @@ -0,0 +1,27 @@ +# Dependency Review Action +# +# This Action will scan dependency manifest files that change as part of a Pull Request, +# surfacing known-vulnerable versions of the packages declared or updated in the PR. +# Once installed, if the workflow run is marked as required, +# PRs introducing known-vulnerable packages will be blocked from merging. +# +# Source repository: https://github.com/actions/dependency-review-action +name: 'Dependency Review' +on: [pull_request] + +permissions: + contents: read + +jobs: + dependency-review: + runs-on: ubuntu-latest + steps: + - name: Harden Runner + uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 + with: + egress-policy: audit + + - name: 'Checkout Repository' + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + - name: 'Dependency Review' + uses: actions/dependency-review-action@0efb1d1d84fc9633afcdaad14c485cbbc90ef46c # v2.5.1 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml new file mode 100644 index 0000000000..583dd083f9 --- /dev/null +++ b/.github/workflows/scorecards.yml @@ -0,0 +1,76 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '0 8 * * 0' + push: + branches: ["develop"] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + contents: read + actions: read + + steps: + - name: Harden Runner + uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 + with: + egress-policy: audit + + - name: "Checkout code" + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecards on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@ceaec5c11a131e0d282ff3b6f095917d234caace # v2.25.3 + with: + sarif_file: results.sarif From 66ef193df29433194f7f6fd6cb41a7cc2fa9af6d Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 2 May 2024 17:02:43 -0600 Subject: [PATCH 239/326] Scorecard: adding manual dispatch and target default branch (#2195) --- .github/workflows/scorecards.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 583dd083f9..7ee2c9c2cf 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -12,7 +12,9 @@ on: schedule: - cron: '0 8 * * 0' push: - branches: ["develop"] + branches: [ "master", "develop"] + workflow_dispatch: + # Declare default permissions as read only. permissions: read-all From a5794bbad9e80d080c2678922a65e17801aa096b Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 3 May 2024 14:05:32 -0600 Subject: [PATCH 240/326] BsrMatrix: Fix HostMirror typedef (#2196) It needed to have size_type. --- sparse/src/KokkosSparse_BsrMatrix.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index 06a9ad92cf..08df2c55a9 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -361,7 +361,8 @@ class BsrMatrix { typedef SizeType size_type; //! Type of a host-memory mirror of the sparse matrix. - typedef BsrMatrix + typedef BsrMatrix HostMirror; //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph Date: Tue, 7 May 2024 11:27:48 -0600 Subject: [PATCH 241/326] update changelog for 4.3.1 --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e6da70740..4f845771c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Change Log +## [4.3.01](https://github.com/kokkos/kokkos-kernels/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.3.00...4.3.01) + +### Bug Fixes: +- sparse: block spiluk fixes [\#2172](https://github.com/kokkos/kokkos-kernels/pull/2172) +- magma: tpl interaction fixes [\#2176](https://github.com/kokkos/kokkos-kernels/pull/2176), [\#2178](https://github.com/kokkos/kokkos-kernels/pull/2178), [\#2181](https://github.com/kokkos/kokkos-kernels/pull/2181) +- trsv: Add early return if numRows == 0 in trsv to avoid integer divide-by-zero error [\#2180](https://github.com/kokkos/kokkos-kernels/pull/2180) +- blas tpl: resolve potential duplicate symbol [\#2183](https://github.com/kokkos/kokkos-kernels/pull/2183) +- spmv: permformance fix, add special path for rank-2 x/y [\#2164](https://github.com/kokkos/kokkos-kernels/pull/2164), [\#2168](https://github.com/kokkos/kokkos-kernels/pull/2168) +- BsrMatrix: Fix HostMirror typedef [\#2196](https://github.com/kokkos/kokkos-kernels/pull/2196) +- GA: Fix macOS docs build [\#2190](https://github.com/kokkos/kokkos-kernels/pull/2190) + ## [4.3.00](https://github.com/kokkos/kokkos-kernels/tree/4.3.00) (2024-03-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.2.01...4.3.00) From bf2c28fb482b4a525d93b0181d2e0f7cee00dc33 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 7 May 2024 11:52:55 -0600 Subject: [PATCH 242/326] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f845771c7..9cb40b5e74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ - magma: tpl interaction fixes [\#2176](https://github.com/kokkos/kokkos-kernels/pull/2176), [\#2178](https://github.com/kokkos/kokkos-kernels/pull/2178), [\#2181](https://github.com/kokkos/kokkos-kernels/pull/2181) - trsv: Add early return if numRows == 0 in trsv to avoid integer divide-by-zero error [\#2180](https://github.com/kokkos/kokkos-kernels/pull/2180) - blas tpl: resolve potential duplicate symbol [\#2183](https://github.com/kokkos/kokkos-kernels/pull/2183) -- spmv: permformance fix, add special path for rank-2 x/y [\#2164](https://github.com/kokkos/kokkos-kernels/pull/2164), [\#2168](https://github.com/kokkos/kokkos-kernels/pull/2168) +- spmv: permformance fix, add back special path for rank-2 x/y with 1 column [\#2164](https://github.com/kokkos/kokkos-kernels/pull/2164), [\#2168](https://github.com/kokkos/kokkos-kernels/pull/2168) - BsrMatrix: Fix HostMirror typedef [\#2196](https://github.com/kokkos/kokkos-kernels/pull/2196) - GA: Fix macOS docs build [\#2190](https://github.com/kokkos/kokkos-kernels/pull/2190) From cfc409cdad14d9a45627b7c21c5e8d286c02719f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 7 May 2024 12:49:28 -0600 Subject: [PATCH 243/326] docs.yml: change kokkos version to latest release - avoid version range check issues for release tests --- .github/workflows/docs.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 1646300e81..b2bf72d2ef 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -33,7 +33,7 @@ jobs: uses: actions/checkout@v4 with: repository: kokkos/kokkos - ref: develop + ref: 4.3.00 path: kokkos - name: configure_kokkos @@ -45,7 +45,6 @@ jobs: -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ -DKokkos_ENABLE_TESTS=OFF \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ .. From d7dace12a55e83a57eb5db56ce2312ea62386134 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 9 May 2024 15:56:17 -0600 Subject: [PATCH 244/326] Sparse - SpGEMM: labeling spgemm_symbolic in TPL layer a bit more clearly (#2193) This just improves the readability of the output from the tools as it now has a symbolic matching the numeric phase. Previously we only had spgemm in the label which is a bit confusing as it could be the whole spgemm time i.e. both symbolic and numeric, additionally we had symbolic in the MKL path but not in cusparse, rocsparse... --- sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index e662934d00..a718769c7b 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -387,7 +387,7 @@ void spgemm_symbolic_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k, bool, c_int_view_t row_mapB, \ c_int_view_t entriesB, bool, \ int_view_t row_mapC, bool computeRowptrs) { \ - std::string label = "KokkosSparse::spgemm[TPL_CUSPARSE," + \ + std::string label = "KokkosSparse::spgemm_symbolic[TPL_CUSPARSE," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spgemm_symbolic_cusparse(handle->get_spgemm_handle(), m, n, k, row_mapA, \ @@ -549,7 +549,7 @@ void spgemm_symbolic_rocsparse( bool, c_int_view_t row_mapB, \ c_int_view_t entriesB, bool, \ int_view_t row_mapC, bool) { \ - std::string label = "KokkosSparse::spgemm[TPL_ROCSPARSE," + \ + std::string label = "KokkosSparse::spgemm_symbolic[TPL_ROCSPARSE," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spgemm_symbolic_rocsparse(handle->get_spgemm_handle(), m, n, k, \ From 3414c914b11e5fc116b501de8e78c98fcdfb023c Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 22 May 2024 10:03:40 -0600 Subject: [PATCH 245/326] SpMV: Test NaN, fix NaN handling when beta=0 (#2188) * Test_Sparse_spmv_bsr.hpp: add NaNs to tests * handle NaN in spmv_beta_transpose when beta=0 * handle nan in SpmvMergeHierarchical when beta=0 * Test NaNs in Y, don't reuse modifed Y, catch NaNs in results test * remove unused include * explicit casting of zero * Test_sparse_spmv.hpp: remove unused nans parameter * KokkosSparse_spmv.hpp: CUDA11 can't detect this function always returns * Test_Sparse_spmv.hpp: remove unused variable * Run unit tests in correct execution space * Test_Sparse_spmv.hpp: remove unused type aliases * Kokkos::nan() -> KokkosKernels::Impl::quiet_NaN() --- common/impl/KokkosKernels_NaN.hpp | 44 ++++++++++++ sparse/impl/KokkosSparse_spmv_impl.hpp | 18 +++-- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 6 +- sparse/src/KokkosSparse_spmv.hpp | 1 + sparse/unit_test/Test_Sparse_spmv.hpp | 76 +++++++++++++++----- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 62 +++++++++++++++- 6 files changed, 178 insertions(+), 29 deletions(-) create mode 100644 common/impl/KokkosKernels_NaN.hpp diff --git a/common/impl/KokkosKernels_NaN.hpp b/common/impl/KokkosKernels_NaN.hpp new file mode 100644 index 0000000000..f319539a9f --- /dev/null +++ b/common/impl/KokkosKernels_NaN.hpp @@ -0,0 +1,44 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_NAN_HPP +#define KOKKOSKERNELS_NAN_HPP + +#include +#include + +namespace KokkosKernels::Impl { + +// This could be constexpr if Kokkos::complex ctor was +template +KOKKOS_INLINE_FUNCTION T quiet_NaN() { + if constexpr (std::is_same_v) { + return double(Kokkos::Experimental::quiet_NaN_v< + float>); // Kokkos::Experimetnal::quiet_NaN_v + // is undefined in + // device code + } else if constexpr (Kokkos::ArithTraits::is_complex) { + using value_type = typename T::value_type; + return T(quiet_NaN(), + quiet_NaN()); // Kokkos::complex ctor is not constexpr + } else { + return Kokkos::Experimental::quiet_NaN_v; + } +} + +} // namespace KokkosKernels::Impl + +#endif // KOKKOSKERNELS_NAN_HPP diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 5f9cbea040..a2bb19a44c 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -450,8 +450,9 @@ static void spmv_beta_transpose(const execution_space& exec, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { - using ordinal_type = typename AMatrix::non_const_ordinal_type; - using size_type = typename AMatrix::non_const_size_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; + using y_scalar_type = typename YVector::non_const_value_type; if (A.numRows() <= static_cast(0)) { return; @@ -459,7 +460,9 @@ static void spmv_beta_transpose(const execution_space& exec, // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. - if (dobeta != 1) { + if (0 == dobeta || y_scalar_type(0) == beta) { + Kokkos::deep_copy(exec, y, y_scalar_type(0)); + } else if (dobeta != 1) { KokkosBlas::scal(exec, y, beta, y); } @@ -540,8 +543,9 @@ static void spmv_beta_transpose(const execution_space& exec, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { - using ordinal_type = typename AMatrix::non_const_ordinal_type; - using size_type = typename AMatrix::non_const_size_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; + using y_scalar_type = typename YVector::non_const_value_type; if (A.numRows() <= static_cast(0)) { return; @@ -549,7 +553,9 @@ static void spmv_beta_transpose(const execution_space& exec, // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. - if (dobeta != 1) { + if (0 == dobeta || y_scalar_type(0) == beta) { + Kokkos::deep_copy(exec, y, y_scalar_type(0)); + } else if (dobeta != 1) { KokkosBlas::scal(exec, y, beta, y); } diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index c49519cc3a..1717d190be 100644 --- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -309,7 +309,11 @@ struct SpmvMergeHierarchical { static_assert(XVector::rank == 1, ""); static_assert(YVector::rank == 1, ""); - KokkosBlas::scal(y, beta, y); + if (y_value_type(0) == beta) { + Kokkos::deep_copy(space, y, y_value_type(0)); + } else { + KokkosBlas::scal(space, y, beta, y); + } /* determine launch parameters for different architectures On architectures where there is a natural execution hierarchy with true diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index f11b61f675..b59124df20 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -62,6 +62,7 @@ inline constexpr bool spmv_general_tpl_avail() { return spmv_mv_bsrmatrix_tpl_spec_avail::value; } + return false; } } // namespace Impl diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 9afd941c93..2057a8ba14 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "KokkosKernels_default_types.hpp" @@ -86,7 +87,10 @@ struct fSPMV { void operator()(const int i, value_type &err) const { const mag_type error = AT::abs(expected_y(i) - y(i)); - if (error > eps * max_val) { + // only one is NaN or error is too large + if ((Kokkos::isnan(AT::abs(expected_y(i))) ^ + Kokkos::isnan(AT::abs(y(i)))) || + (error > eps * max_val)) { err++; Kokkos::printf("expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i)), error, @@ -116,6 +120,7 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using size_type_view_t = typename graph_t::row_map_type; using lno_view_t = typename graph_t::entries_type; using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using y_scalar_t = typename y_vector_type::non_const_value_type; using size_type = typename size_type_view_t::non_const_value_type; using lno_t = typename lno_view_t::non_const_value_type; @@ -145,7 +150,13 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, lno_t nr = input_mat.numRows(); // first, scale y by beta - for (size_t i = 0; i < h_y.extent(0); i++) h_y(i) *= beta; + for (size_t i = 0; i < h_y.extent(0); i++) { + if (beta == y_scalar_t(0)) { + h_y(i) = y_scalar_t(0); + } else { + h_y(i) *= beta; + } + } // then go through the matrix and accumulate the matrix-vector product for (lno_t row = 0; row < nr; ++row) { @@ -184,17 +195,19 @@ void check_spmv( const y_value_mag_type eps = 10 * Kokkos::ArithTraits::eps(); - bool transposed = (mode == "T") || (mode == "H"); - y_vector_type expected_y( - "expected", transposed ? input_mat.numCols() : input_mat.numRows()); + + y_vector_type actual_y("actual_y", y.extent(0)); + y_vector_type expected_y("expected_y", y.extent(0)); Kokkos::deep_copy(expected_y, y); + Kokkos::deep_copy(actual_y, y); Kokkos::fence(); sequential_spmv(input_mat, x, expected_y, alpha, beta, mode); bool threw = false; std::string msg; try { - KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta, y); + KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta, + actual_y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -203,11 +216,11 @@ void check_spmv( ASSERT_FALSE(threw) << "KokkosSparse::Test::spmv 1D, mode " << mode << ": threw exception:\n" << msg << '\n'; + int num_errors = 0; Kokkos::parallel_reduce( - "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps, max_val), - num_errors); + "KokkosSparse::Test::spmv", my_exec_space(0, actual_y.extent(0)), + fSPMV(expected_y, actual_y, eps, max_val), num_errors); if (num_errors > 0) printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", num_errors, y.extent_int(0), y_value_trait::abs(alpha), @@ -266,10 +279,9 @@ void check_spmv_mv( auto y_spmv = Kokkos::subview(y, Kokkos::ALL(), i); int num_errors = 0; - Kokkos::parallel_reduce( - "KokkosSparse::Test::spmv_mv", my_exec_space(0, y_i.extent(0)), - fSPMV(y_i, y_spmv, eps, max_val), - num_errors); + Kokkos::parallel_reduce("KokkosSparse::Test::spmv_mv", + my_exec_space(0, y_i.extent(0)), + fSPMV(y_i, y_spmv, eps, max_val), num_errors); if (num_errors > 0) std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) << " for mv " << i @@ -404,6 +416,7 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz, using mag_t = typename Kokkos::ArithTraits::mag_type; using handle_t = KokkosSparse::SPMVHandle; + using y_policy = Kokkos::RangePolicy; constexpr mag_t max_x = static_cast(1); constexpr mag_t max_y = static_cast(1); @@ -419,18 +432,35 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz, const lno_t max_nnz_per_row = numRows ? (nnz / numRows + row_size_variance) : 0; + // Create vectors with and without nans x_vector_type input_x("x", nc); - y_vector_type output_y("y", nr); x_vector_type input_xt("x", nr); - y_vector_type output_yt("y", nc); + y_vector_type input_y("y", nr), input_y_nans("y_nans", nr); + y_vector_type input_yt("y", nc), input_yt_nans("y_nans", nc); Kokkos::Random_XorShift64_Pool rand_pool( 13718); Kokkos::fill_random(input_x, rand_pool, randomUpperBound(max_x)); - Kokkos::fill_random(output_y, rand_pool, randomUpperBound(max_y)); + Kokkos::fill_random(input_y, rand_pool, randomUpperBound(max_y)); Kokkos::fill_random(input_xt, rand_pool, randomUpperBound(max_x)); - Kokkos::fill_random(output_yt, rand_pool, randomUpperBound(max_y)); + Kokkos::fill_random(input_yt, rand_pool, randomUpperBound(max_y)); + + // sprinkle in some nans + Kokkos::deep_copy(input_y_nans, input_y); + Kokkos::deep_copy(input_yt_nans, input_yt); + Kokkos::parallel_for( + y_policy(0, input_y_nans.extent(0)), KOKKOS_LAMBDA(const size_t i) { + if (0 == (i % 19)) { + input_y_nans(i) = KokkosKernels::Impl::quiet_NaN(); + } + }); + Kokkos::parallel_for( + y_policy(0, input_yt_nans.extent(0)), KOKKOS_LAMBDA(const size_t i) { + if (0 == (i % 23)) { + input_yt_nans(i) = KokkosKernels::Impl::quiet_NaN(); + } + }); // We also need to bound the values // in the matrix to bound the cancellations @@ -457,8 +487,12 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz, for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(&handle, input_mat, input_x, output_y, alpha, beta, + Test::check_spmv(&handle, input_mat, input_x, input_y, alpha, beta, mode, max_error); + if (0 == beta) { + Test::check_spmv(&handle, input_mat, input_x, input_y_nans, alpha, + beta, mode, max_error); + } } } } @@ -468,8 +502,12 @@ void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz, // hoping the transpose won't have a long column... mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(&handle, input_mat, input_xt, output_yt, alpha, beta, + Test::check_spmv(&handle, input_mat, input_xt, input_yt, alpha, beta, mode, max_error); + if (0 == beta) { + Test::check_spmv(&handle, input_mat, input_x, input_yt_nans, alpha, + beta, mode, max_error); + } } } } diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index cb42f5c2e4..e9b23298f9 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -41,6 +41,7 @@ #include #include #include "KokkosKernels_default_types.hpp" +#include #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_BsrMatrix.hpp" @@ -327,10 +328,15 @@ spmv_random(const char *mode, const int blockSize, const int blockRows, /*! \brief create random x and y multivectors for a given matrix and spmv mode */ template -auto random_vecs_for_spmv(const char *mode, const Bsr &a) { +auto random_vecs_for_spmv(const char *mode, const Bsr &a, + const bool nans = false) + -> std::tuple::type, + typename VectorTypeFor::type> { using scalar_type = typename Bsr::non_const_value_type; using vector_type = typename VectorTypeFor::type; using execution_space = typename Bsr::execution_space; + using policy_type = + Kokkos::RangePolicy; size_t nx = a.numCols() * a.blockDim(); size_t ny = a.numRows() * a.blockDim(); @@ -344,6 +350,21 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a) { Kokkos::fill_random(x, random, max_x()); Kokkos::fill_random(y, random, max_y()); + if (nans) { + Kokkos::parallel_for( + policy_type(0, x.extent(0)), KOKKOS_LAMBDA(size_t i) { + if (0 == (i % 17)) { + x(i) = KokkosKernels::Impl::quiet_NaN(); + } + }); + Kokkos::parallel_for( + policy_type(0, y.extent(0)), KOKKOS_LAMBDA(size_t i) { + if (0 == (i % 17)) { + y(i) = KokkosKernels::Impl::quiet_NaN(); + } + }); + } + return std::make_tuple(x, y); } @@ -356,7 +377,8 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, using scalar_type = typename Bsr::non_const_value_type; using execution_space = typename Bsr::execution_space; - auto [x, y] = random_vecs_for_spmv(mode, a); + auto [x, y] = random_vecs_for_spmv(mode, a); + auto [x_with_nans, y_with_nans] = random_vecs_for_spmv(mode, a, true); using handle_t = SPMVHandle; @@ -389,6 +411,10 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + if (beta == scalar_type(0)) { + test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + x_with_nans, y_with_nans); + } } } } @@ -563,10 +589,14 @@ struct MultiVectorTypeFor { */ template auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, - const size_t numVecs) { + const size_t numVecs, const bool nans = false) + -> std::tuple::type, + typename MultiVectorTypeFor::type> { using scalar_type = typename Bsr::non_const_value_type; using vector_type = typename MultiVectorTypeFor::type; using execution_space = typename Bsr::execution_space; + using policy_type = + Kokkos::RangePolicy; size_t nx = a.numCols() * a.blockDim(); size_t ny = a.numRows() * a.blockDim(); @@ -580,6 +610,26 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, Kokkos::fill_random(x, random, max_x()); Kokkos::fill_random(y, random, max_y()); + // sprinkle some "random" NaNs in + if (nans) { + Kokkos::parallel_for( + policy_type(0, x.extent(0)), KOKKOS_LAMBDA(size_t i) { + for (size_t j = 0; j < x.extent(1); ++j) { + if (0 == ((i * x.extent(1) + j) % 13)) { + x(i, j) = KokkosKernels::Impl::quiet_NaN(); + } + } + }); + Kokkos::parallel_for( + policy_type(0, y.extent(0)), KOKKOS_LAMBDA(size_t i) { + for (size_t j = 0; j < y.extent(1); ++j) { + if (0 == ((i * y.extent(1) + j) % 17)) { + y(i, j) = KokkosKernels::Impl::quiet_NaN(); + } + } + }); + } + return std::make_tuple(x, y); } @@ -618,12 +668,18 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, for (size_t numVecs : {1, 7}) { // num multivecs auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); + auto [x_with_nans, y_with_nans] = + random_multivecs_for_spm_mv(mode, a, numVecs, true); for (handle_t *handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + if (beta == scalar_type(0)) { + test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + x_with_nans, y_with_nans); + } } } } From 62041513e3defcaba4ed78d8a6ca9ba8d91bb54c Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 22 May 2024 14:52:33 -0600 Subject: [PATCH 246/326] Disable cuBLAS dot wrapper (#2206) (not deleted, just guarded with #if 0 and comments explaining) It performs significantly worse than our native impl on 11.2, 11.8 and 12.0 on V100. This is in the dot perf test with a warm-up call. https://github.com/trilinos/Trilinos/issues/12982 was a symptom of this. --- blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 6 ++++++ blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 3ba8f063b4..13cc2a6f92 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -89,9 +89,15 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +// Note BMK: CUBLAS dot is consistently slower than our native dot +// (measured 11.2, 11.8, 12.0 using perf test, and all are similar) +// If a future version improves performance, re-enable it here and +// in the tpl_spec_decl file. +#if 0 KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) #endif +#endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index ace26ebdbd..247957b2c8 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -101,6 +101,9 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +// Disabled because native has better performance. +// See tpl_spec_avail file for more details +#if 0 #include namespace KokkosBlas { @@ -174,6 +177,7 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) } // namespace Impl } // namespace KokkosBlas #endif +#endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS From feb1f5576447c8d3f6bae399edad4dd8a2a8d830 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 22 May 2024 17:15:01 -0600 Subject: [PATCH 247/326] Fix spmv regressions (#2204) * Restore cusparse spmv ALG2 path for imbalanced With correct version cutoffs * spmv: use separate rank-1 and rank-2 tpl subhandles * Remove redundant single-column path in native spmv_mv * Fix unused param warning --- sparse/impl/KokkosSparse_spmv_spec.hpp | 56 ++++----------- sparse/src/KokkosSparse_spmv.hpp | 49 +++---------- sparse/src/KokkosSparse_spmv_handle.hpp | 9 +-- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 47 ++++++------- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 12 ++-- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 68 +++++++++++-------- 6 files changed, 90 insertions(+), 151 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index da02b1af5a..67a2f05639 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -203,54 +203,24 @@ struct SPMV_MV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const ExecutionSpace& space, Handle* handle, + // TODO: pass handle through to implementation and use tuning parameters + static void spmv_mv(const ExecutionSpace& space, Handle* /* handle */, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::ArithTraits KAT; - // Intercept special case: if x/y have only 1 column and both are - // contiguous, use the more efficient single-vector impl. - // - // We cannot do this if x or y is noncontiguous, because the column subview - // must be LayoutStride which is not ETI'd. - // - // Do not use a TPL even if one is available for the types: - // we don't want the same handle being used in both TPL and non-TPL versions - if (x.extent(1) == size_t(1) && x.span_is_contiguous() && - y.span_is_contiguous()) { - Kokkos::View - x0(x.data(), x.extent(0)); - Kokkos::View - y0(y.data(), y.extent(0)); - if (beta == KAT::zero()) { - spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); - } else if (beta == KAT::one()) { - spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); - } else if (beta == -KAT::one()) { - spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); - } else { - spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); - } + if (alpha == KAT::zero()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else if (alpha == KAT::one()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else if (alpha == -KAT::one()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } else { - if (alpha == KAT::zero()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } else if (alpha == KAT::one()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } else if (alpha == -KAT::one()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } else { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } } }; diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index b59124df20..336bae4f1d 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -40,32 +40,6 @@ struct RANK_ONE {}; struct RANK_TWO {}; } // namespace -namespace Impl { -template -inline constexpr bool spmv_general_tpl_avail() { - constexpr bool isBSR = ::KokkosSparse::Experimental::is_bsr_matrix_v; - if constexpr (!isBSR) { - // CRS - if constexpr (XVector::rank() == 1) - return spmv_tpl_spec_avail::value; - else - return spmv_mv_tpl_spec_avail::value; - } else { - // BSR - if constexpr (XVector::rank() == 1) - return spmv_bsrmatrix_tpl_spec_avail::value; - else - return spmv_mv_bsrmatrix_tpl_spec_avail::value; - } - return false; -} -} // namespace Impl - // clang-format off /// \brief Kokkos sparse matrix-vector multiply. /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is @@ -248,8 +222,8 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], typename YVector::device_type, Kokkos::MemoryTraits>; // Special case: XVector/YVector are rank-2 but x,y both have one column and - // are contiguous. If a TPL is available for rank-1 vectors but not rank-2, - // take rank-1 subviews of x,y and call the rank-1 version. + // are contiguous. In this case take rank-1 subviews of x,y and call the + // rank-1 version. if constexpr (XVector::rank() == 2) { using XVector_SubInternal = Kokkos::View< typename XVector::const_value_type*, @@ -260,19 +234,12 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], typename YVector::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename YVector::device_type, Kokkos::MemoryTraits>; - if constexpr (!Impl::spmv_general_tpl_avail< - ExecutionSpace, HandleImpl, AMatrix_Internal, - XVector_Internal, YVector_Internal>() && - Impl::spmv_general_tpl_avail< - ExecutionSpace, HandleImpl, AMatrix_Internal, - XVector_SubInternal, YVector_SubInternal>()) { - if (x.extent(1) == size_t(1) && x.span_is_contiguous() && - y.span_is_contiguous()) { - XVector_SubInternal xsub(x.data(), x.extent(0)); - YVector_SubInternal ysub(y.data(), y.extent(0)); - spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub); - return; - } + if (x.extent(1) == size_t(1) && x.span_is_contiguous() && + y.span_is_contiguous()) { + XVector_SubInternal xsub(x.data(), x.extent(0)); + YVector_SubInternal ysub(y.data(), y.extent(0)); + spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub); + return; } } diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index b3e878b5e9..6d23d2bde1 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -234,7 +234,8 @@ struct SPMVHandleImpl { "SPMVHandleImpl: Ordinal must not be a const type"); SPMVHandleImpl(SPMVAlgorithm algo_) : algo(algo_) {} ~SPMVHandleImpl() { - if (tpl) delete tpl; + if (tpl_rank1) delete tpl_rank1; + if (tpl_rank2) delete tpl_rank2; } ImplType* get_impl() { return this; } @@ -242,9 +243,9 @@ struct SPMVHandleImpl { /// Get the SPMVAlgorithm used by this handle SPMVAlgorithm get_algorithm() const { return this->algo; } - bool is_set_up = false; - const SPMVAlgorithm algo = SPMV_DEFAULT; - TPL_SpMV_Data* tpl = nullptr; + const SPMVAlgorithm algo = SPMV_DEFAULT; + TPL_SpMV_Data* tpl_rank1 = nullptr; + TPL_SpMV_Data* tpl_rank2 = nullptr; // Expert tuning parameters for native SpMV // TODO: expose a proper Experimental interface to set these. Currently they // can be assigned directly in the SPMVHandle as they are public members. diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 7eb6307753..3564fa68fd 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -43,8 +43,8 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, Subhandle* subhandle; const MKLScalar* x_mkl = reinterpret_cast(x); MKLScalar* y_mkl = reinterpret_cast(y); - if (handle->is_set_up) { - subhandle = dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast(handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); @@ -54,7 +54,7 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, // Use the default execution space instance, as classic MKL does not use // a specific instance. subhandle = new Subhandle(ExecSpace()); - handle->tpl = subhandle; + handle->tpl_rank1 = subhandle; subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; subhandle->descr.mode = SPARSE_FILL_MODE_FULL; subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; @@ -87,7 +87,6 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, const_cast(Arowptrs + 1), const_cast(Aentries), Avalues_mkl)); } - handle->is_set_up = true; } MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar(alpha); MKLScalar beta_mkl = KokkosSparse::Impl::KokkosToMKLScalar(beta); @@ -124,8 +123,8 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, Subhandle* subhandle; const MKLScalar* x_mkl = reinterpret_cast(x); MKLScalar* y_mkl = reinterpret_cast(y); - if (handle->is_set_up) { - subhandle = dynamic_cast(handle->tpl); + if (handle->tpl_rank2) { + subhandle = dynamic_cast(handle->tpl_rank2); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); @@ -135,7 +134,7 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, // Use the default execution space instance, as classic MKL does not use // a specific instance. subhandle = new Subhandle(ExecSpace()); - handle->tpl = subhandle; + handle->tpl_rank2 = subhandle; subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; subhandle->descr.mode = SPARSE_FILL_MODE_FULL; subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; @@ -168,7 +167,6 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, const_cast(Arowptrs + 1), const_cast(Aentries), Avalues_mkl)); } - handle->is_set_up = true; } MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar(alpha); MKLScalar beta_mkl = KokkosSparse::Impl::KokkosToMKLScalar(beta); @@ -376,23 +374,22 @@ void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; - if (handle->is_set_up) { - subhandle = - dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast( + handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ - subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl_rank1 = subhandle; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); KOKKOS_CUSPARSE_SAFE_CALL( cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); KOKKOS_CUSPARSE_SAFE_CALL( cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); - handle->is_set_up = true; } cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; @@ -504,23 +501,22 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; - if (handle->is_set_up) { - subhandle = - dynamic_cast(handle->tpl); + if (handle->tpl_rank2) { + subhandle = dynamic_cast( + handle->tpl_rank2); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ - subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl_rank2 = subhandle; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); KOKKOS_CUSPARSE_SAFE_CALL( cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); KOKKOS_CUSPARSE_SAFE_CALL( cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); - handle->is_set_up = true; } cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; @@ -855,16 +851,16 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, rocsparse_value_type* y_ = reinterpret_cast(y.data()); KokkosSparse::Impl::RocSparse_BSR_SpMV_Data* subhandle; - if (handle->is_set_up) { - subhandle = - dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast( + handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for rocsparse BSR"); subhandle->set_exec_space(exec); } else { - subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); + handle->tpl_rank1 = subhandle; KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( rocsparse_create_mat_descr(&subhandle->mat)); // *_ex* functions deprecated in introduced in 6+ @@ -918,7 +914,6 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, "unsupported value type for rocsparse_*bsrmv"); } #endif - handle->is_set_up = true; } // *_ex* functions deprecated in introduced in 6+ diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 500fbddbe7..c52047ab25 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -186,16 +186,16 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, } KokkosSparse::Impl::CuSparse10_SpMV_Data *subhandle; - if (handle->is_set_up) { - subhandle = - dynamic_cast(handle->tpl); + if (handle->tpl_rank2) { + subhandle = dynamic_cast( + handle->tpl_rank2); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); subhandle->set_exec_space(exec); } else { - subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); + handle->tpl_rank2 = subhandle; /* create matrix */ KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), @@ -209,8 +209,6 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMalloc(&subhandle->buffer, subhandle->bufferSize)); - - handle->is_set_up = true; } KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM(cusparseHandle, opA, opB, &alpha, diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 66ea90c746..c8d25c2c58 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -96,25 +96,38 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( &vecY, y.extent_int(0), (void*)y.data(), myCudaDataType)); - // use default cusparse algo for best performance + // Prior to CUDA 11.2.1, ALG2 was more performant than default for imbalanced + // matrices. After 11.2.1, the default is performant for imbalanced matrices, + // and ALG2 now means something else. CUDA >= 11.2.1 corresponds to + // CUSPARSE_VERSION >= 11402. +#if CUSPARSE_VERSION >= 11402 + const bool useAlg2 = false; +#else + const bool useAlg2 = handle->get_algorithm() == SPMV_MERGE_PATH; +#endif + + // In CUDA 11.2.0, the algorithm enums were renamed. + // This corresponds to CUSPARSE_VERSION >= 11400. #if CUSPARSE_VERSION >= 11400 - cusparseSpMVAlg_t algo = CUSPARSE_SPMV_ALG_DEFAULT; + cusparseSpMVAlg_t algo = + useAlg2 ? CUSPARSE_SPMV_CSR_ALG2 : CUSPARSE_SPMV_ALG_DEFAULT; #else - cusparseSpMVAlg_t algo = CUSPARSE_MV_ALG_DEFAULT; + cusparseSpMVAlg_t algo = + useAlg2 ? CUSPARSE_CSRMV_ALG2 : CUSPARSE_MV_ALG_DEFAULT; #endif KokkosSparse::Impl::CuSparse10_SpMV_Data* subhandle; - if (handle->is_set_up) { - subhandle = - dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast( + handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); subhandle->set_exec_space(exec); } else { - subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); + handle->tpl_rank1 = subhandle; /* create matrix */ KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( @@ -135,7 +148,6 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMalloc(&subhandle->buffer, subhandle->bufferSize)); #endif - handle->is_set_up = true; } /* perform SpMV */ @@ -150,24 +162,23 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; - if (handle->is_set_up) { - subhandle = - dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast( + handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ - subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl_rank1 = subhandle; cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); KOKKOS_CUSPARSE_SAFE_CALL( cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); KOKKOS_CUSPARSE_SAFE_CALL( cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); - handle->is_set_up = true; } /* perform the actual SpMV operation */ @@ -386,16 +397,16 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], rocsparse_spmv_alg alg = rocsparse_spmv_alg_default; KokkosSparse::Impl::RocSparse_CRS_SpMV_Data* subhandle; - if (handle->is_set_up) { - subhandle = - dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast( + handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for rocsparse CRS"); subhandle->set_exec_space(exec); } else { - subhandle = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec); + handle->tpl_rank1 = subhandle; /* Create the rocsparse csr descr */ // We need to do some casting to void* // Note that row_map is always a const view so const_cast is necessary, @@ -443,7 +454,6 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], KOKKOS_IMPL_HIP_SAFE_CALL( hipMalloc(&subhandle->buffer, subhandle->bufferSize)); #endif - handle->is_set_up = true; } /* Perform the actual computation */ @@ -551,8 +561,8 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, Subhandle* subhandle; const MKLScalar* x_mkl = reinterpret_cast(x); MKLScalar* y_mkl = reinterpret_cast(y); - if (handle->is_set_up) { - subhandle = dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast(handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL CRS"); @@ -562,7 +572,7 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, // Use the default execution space instance, as classic MKL does not use // a specific instance. subhandle = new Subhandle(ExecSpace()); - handle->tpl = subhandle; + handle->tpl_rank1 = subhandle; subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; subhandle->descr.mode = SPARSE_FILL_MODE_FULL; subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; @@ -591,7 +601,6 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), Avalues_mkl)); } - handle->is_set_up = true; } MKLScalar alpha_mkl = KokkosToMKLScalar(alpha); MKLScalar beta_mkl = KokkosToMKLScalar(beta); @@ -709,15 +718,15 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle, mkl_mode = oneapi::mkl::transpose::trans; OneMKL_SpMV_Data* subhandle; - if (handle->is_set_up) { - subhandle = dynamic_cast(handle->tpl); + if (handle->tpl_rank1) { + subhandle = dynamic_cast(handle->tpl_rank1); if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for OneMKL CRS"); subhandle->set_exec_space(exec); } else { - subhandle = new OneMKL_SpMV_Data(exec); - handle->tpl = subhandle; + subhandle = new OneMKL_SpMV_Data(exec); + handle->tpl_rank1 = subhandle; oneapi::mkl::sparse::init_matrix_handle(&subhandle->mat); // Even for out-of-order SYCL queue, the inputs here do not depend on // kernels being sequenced @@ -732,7 +741,6 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle, // optimize_gemv has finished oneapi::mkl::sparse::optimize_gemv(exec.sycl_queue(), mkl_mode, subhandle->mat, {ev}); - handle->is_set_up = true; } // Uncommon case: an out-of-order SYCL queue does not promise that previously From 4297e4aca040702ae0c39a96c46f5ad58081684e Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 23 May 2024 07:41:00 -0600 Subject: [PATCH 248/326] c++17: add [[fallthrough]] attribute (#1493) * c++17: add [[fallthrough]] attribute * cm_test_all_sandia: -Wimplicit-fallthrough --- graph/impl/KokkosGraph_Distance2Color_impl.hpp | 2 +- perf_test/sparse/KokkosSparse_sptrsv.cpp | 2 ++ scripts/cm_test_all_sandia | 2 +- sparse/impl/KokkosSparse_spgemm_impl_memaccess.hpp | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/graph/impl/KokkosGraph_Distance2Color_impl.hpp index 2ab04667e0..58b6d79ebb 100644 --- a/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -157,7 +157,7 @@ class GraphColorDistance2 { colors_out = color_view_type("Graph Colors", this->nr); } switch (this->gc_handle->get_coloring_algo_type()) { - case COLORING_D2_VB_BIT_EF: using_edge_filtering = true; + case COLORING_D2_VB_BIT_EF: using_edge_filtering = true; [[fallthrough]]; case COLORING_D2_VB_BIT: case COLORING_D2_VB: compute_d2_coloring_vb(colors_out); break; case COLORING_D2_NB_BIT: compute_d2_coloring_nb(colors_out); break; diff --git a/perf_test/sparse/KokkosSparse_sptrsv.cpp b/perf_test/sparse/KokkosSparse_sptrsv.cpp index 35e8b0e16e..2ae5afae50 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv.cpp @@ -279,6 +279,7 @@ int test_sptrsv_perf(std::vector tests, const std::string &lfilename, #else std::cout << "CUSPARSE not enabled: Fall through to defaults" << std::endl; + [[fallthrough]]; #endif default: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, @@ -681,6 +682,7 @@ int test_sptrsv_perf(std::vector tests, const std::string &lfilename, #else std::cout << "CUSPARSE not enabled: Fall through to defaults" << std::endl; + [[fallthrough]]; #endif default: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index db13665a3b..1bc7692e08 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -702,7 +702,7 @@ elif [ "$MACHINE" = "blake" ]; then ONEAPI_FLAGS_EXTRA="-fp-model=precise" LLVM_EXTRA_FLAGS="-fPIC ${CLANG_WARNING_FLAGS}" # Remove -Wuninitialized: compiler issues show up with Threads backend - GCC11_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered" + GCC11_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wimplicit-fallthrough" # update KOKKOS_PASSTHRU_CMAKE_FLAGS to disable onedpl on Blake KOKKOS_PASSTHRU_CMAKE_FLAGS="${KOKKOS_PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_ONEDPL=OFF" diff --git a/sparse/impl/KokkosSparse_spgemm_impl_memaccess.hpp b/sparse/impl/KokkosSparse_spgemm_impl_memaccess.hpp index 51d151e4fb..575ddca578 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_memaccess.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_memaccess.hpp @@ -663,7 +663,7 @@ void KokkosSPGEMM Date: Thu, 23 May 2024 13:51:22 -0600 Subject: [PATCH 249/326] Enable 3 at2 builds (#2210) * .github/mi210: Enable on PRs * .github/mi210: Disable non-tpl build * .github/bdw: Enable PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_REL * .github/h100: Enable PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL --- .github/workflows/bdw.yml | 509 ++++++++++++++++++------------------ .github/workflows/h100.yml | 22 +- .github/workflows/mi210.yml | 181 +++++++------ 3 files changed, 355 insertions(+), 357 deletions(-) diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index f5f4f9700e..381c813bbe 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -1,15 +1,14 @@ name: github-BDW -# Only allow manual runs until at2 runners are available. -on: workflow_dispatch - #pull_request: - # paths-ignore: - # - '**/*.rst' - # - '**/*.md' - # - '**/requirements.txt' - # - '**/*.py' - # - 'docs/**' - # types: [ opened, reopened, synchronize ] +on: + pull_request: + paths-ignore: + - '**/*.rst' + - '**/*.md' + - '**/requirements.txt' + - '**/*.py' + - 'docs/**' + types: [ opened, reopened, synchronize ] permissions: contents: none @@ -20,168 +19,168 @@ concurrency: cancel-in-progress: true jobs: - PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI: - name: PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI - runs-on: [kk-env-gcc-10.2.0-latest] - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v3 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v3 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build,install} - cd kokkos/build - cmake \ - -DCMAKE_CXX_COMPILER=g++ \ - -DCMAKE_CXX_FLAGS=-O3 \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ARCH_BDW=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j12 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install} - cd kokkos-kernels/build - cmake \ - -DCMAKE_CXX_COMPILER=g++ \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 " \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DCMAKE_EXE_LINKER_FLAGS="" \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_TEST_ETI_ONLY=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build_kokkos_kernels - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 - - PR_BDW_GNU1020_THREADS_SERIAL_RIGHT_REL: - name: PR_BDW_GNU1020_THREADS_SERIAL_RIGHT_REL - runs-on: [kk-env-gcc-10.2.0-latest] - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v3 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v3 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build,install} - cd kokkos/build - cmake \ - -DCMAKE_CXX_COMPILER=g++ \ - -DCMAKE_CXX_FLAGS=-O3 \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_SERIAL=ON \ - -DKokkos_ENABLE_THREADS=ON \ - -DKokkos_ARCH_BDW=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j12 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install} - cd kokkos-kernels/build - cmake \ - -DCMAKE_CXX_COMPILER=g++ \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 " \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DCMAKE_EXE_LINKER_FLAGS="" \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build_kokkos_kernels - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 +# PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI: +# name: PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI +# runs-on: [kk-env-gcc-10.2.0-latest] +# +# steps: +# - name: checkout_kokkos_kernels +# uses: actions/checkout@v3 +# with: +# path: kokkos-kernels +# +# - name: checkout_kokkos +# uses: actions/checkout@v3 +# with: +# repository: kokkos/kokkos +# ref: ${{ github.base_ref }} +# path: kokkos +# +# - name: configure_kokkos +# run: | +# mkdir -p kokkos/{build,install} +# cd kokkos/build +# cmake \ +# -DCMAKE_CXX_COMPILER=g++ \ +# -DCMAKE_CXX_FLAGS=-O3 \ +# -DCMAKE_EXE_LINKER_FLAGS= \ +# -DCMAKE_INSTALL_PREFIX=$PWD/../install \ +# -DKokkos_ENABLE_OPENMP=ON \ +# -DKokkos_ARCH_BDW=ON \ +# -DKokkos_ENABLE_TESTS=OFF \ +# -DKokkos_ENABLE_EXAMPLES=OFF \ +# -DCMAKE_VERBOSE_MAKEFILE=ON \ +# -DCMAKE_CXX_EXTENSIONS=OFF \ +# -DCMAKE_CXX_STANDARD=17 \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +# -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ +# .. +# +# - name: build_and_install_kokkos +# working-directory: kokkos/build +# run: make -j12 install +# +# - name: configure_kokkos_kernels +# run: | +# mkdir -p kokkos-kernels/{build,install} +# cd kokkos-kernels/build +# cmake \ +# -DCMAKE_CXX_COMPILER=g++ \ +# -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ +# -DCMAKE_CXX_FLAGS="-O3 " \ +# -DCMAKE_INSTALL_PREFIX= \ +# -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ +# -DKokkosKernels_ENABLE_TESTS=ON \ +# -DKokkosKernels_ENABLE_PERFTESTS=ON \ +# -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ +# -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ +# -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ +# -DKokkosKernels_INST_DOUBLE=ON \ +# -DKokkosKernels_INST_ORDINAL_INT=ON \ +# -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ +# -DKokkosKernels_INST_OFFSET_INT=ON \ +# -DKokkosKernels_INST_LAYOUTLEFT=ON \ +# -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ +# -DCMAKE_EXE_LINKER_FLAGS="" \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkosKernels_TEST_ETI_ONLY=OFF \ +# -DKokkosKernels_ENABLE_DOCS=OFF \ +# .. +# +# - name: build_kokkos_kernels +# working-directory: kokkos-kernels/build +# run: make -j12 all +# +# - name: test +# working-directory: kokkos-kernels/build +# run: ctest --output-on-failure -V --timeout 3600 +# +# PR_BDW_GNU1020_THREADS_SERIAL_RIGHT_REL: +# name: PR_BDW_GNU1020_THREADS_SERIAL_RIGHT_REL +# runs-on: [kk-env-gcc-10.2.0-latest] +# +# steps: +# - name: checkout_kokkos_kernels +# uses: actions/checkout@v3 +# with: +# path: kokkos-kernels +# +# - name: checkout_kokkos +# uses: actions/checkout@v3 +# with: +# repository: kokkos/kokkos +# ref: ${{ github.base_ref }} +# path: kokkos +# +# - name: configure_kokkos +# run: | +# mkdir -p kokkos/{build,install} +# cd kokkos/build +# cmake \ +# -DCMAKE_CXX_COMPILER=g++ \ +# -DCMAKE_CXX_FLAGS=-O3 \ +# -DCMAKE_EXE_LINKER_FLAGS= \ +# -DCMAKE_INSTALL_PREFIX=$PWD/../install \ +# -DKokkos_ENABLE_SERIAL=ON \ +# -DKokkos_ENABLE_THREADS=ON \ +# -DKokkos_ARCH_BDW=ON \ +# -DKokkos_ENABLE_TESTS=OFF \ +# -DKokkos_ENABLE_EXAMPLES=OFF \ +# -DCMAKE_VERBOSE_MAKEFILE=ON \ +# -DCMAKE_CXX_EXTENSIONS=OFF \ +# -DCMAKE_CXX_STANDARD=17 \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +# -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ +# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +# .. +# +# - name: build_and_install_kokkos +# working-directory: kokkos/build +# run: make -j12 install +# +# - name: configure_kokkos_kernels +# run: | +# mkdir -p kokkos-kernels/{build,install} +# cd kokkos-kernels/build +# cmake \ +# -DCMAKE_CXX_COMPILER=g++ \ +# -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ +# -DCMAKE_CXX_FLAGS="-O3 " \ +# -DCMAKE_INSTALL_PREFIX= \ +# -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ +# -DKokkosKernels_ENABLE_TESTS=ON \ +# -DKokkosKernels_ENABLE_PERFTESTS=ON \ +# -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ +# -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ +# -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ +# -DKokkosKernels_INST_DOUBLE=ON \ +# -DKokkosKernels_INST_ORDINAL_INT=ON \ +# -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ +# -DKokkosKernels_INST_OFFSET_INT=ON \ +# -DKokkosKernels_INST_LAYOUTLEFT=ON \ +# -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ +# -DCMAKE_EXE_LINKER_FLAGS="" \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkosKernels_ENABLE_DOCS=OFF \ +# .. +# +# - name: build_kokkos_kernels +# working-directory: kokkos-kernels/build +# run: make -j12 all +# +# - name: test +# working-directory: kokkos-kernels/build +# run: ctest --output-on-failure -V --timeout 3600 PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_REL: name: PR_BDW_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_REL @@ -268,86 +267,86 @@ jobs: working-directory: kokkos-kernels/build run: ctest --output-on-failure -V --timeout 3600 - PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL: - name: PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL - runs-on: [kk-env-llvm-10.0.1-latest] - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v3 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v3 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build,install} - cd kokkos/build - cmake \ - -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_SERIAL=ON \ - -DKokkos_ENABLE_THREADS=ON \ - -DKokkos_ARCH_BDW=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j12 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install} - cd kokkos-kernels/build - cmake \ - -DCMAKE_CXX_COMPILER=clang++ \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DCMAKE_EXE_LINKER_FLAGS="" \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build_kokkos_kernels - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 - - \ No newline at end of file +# PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL: +# name: PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL +# runs-on: [kk-env-llvm-10.0.1-latest] +# +# steps: +# - name: checkout_kokkos_kernels +# uses: actions/checkout@v3 +# with: +# path: kokkos-kernels +# +# - name: checkout_kokkos +# uses: actions/checkout@v3 +# with: +# repository: kokkos/kokkos +# ref: ${{ github.base_ref }} +# path: kokkos +# +# - name: configure_kokkos +# run: | +# mkdir -p kokkos/{build,install} +# cd kokkos/build +# cmake \ +# -DCMAKE_CXX_COMPILER=clang++ \ +# -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ +# -DCMAKE_EXE_LINKER_FLAGS= \ +# -DCMAKE_INSTALL_PREFIX=$PWD/../install \ +# -DKokkos_ENABLE_SERIAL=ON \ +# -DKokkos_ENABLE_THREADS=ON \ +# -DKokkos_ARCH_BDW=ON \ +# -DKokkos_ENABLE_TESTS=OFF \ +# -DKokkos_ENABLE_EXAMPLES=OFF \ +# -DCMAKE_VERBOSE_MAKEFILE=ON \ +# -DCMAKE_CXX_EXTENSIONS=OFF \ +# -DCMAKE_CXX_STANDARD=17 \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +# -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ +# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +# .. +# +# - name: build_and_install_kokkos +# working-directory: kokkos/build +# run: make -j12 install +# +# - name: configure_kokkos_kernels +# run: | +# mkdir -p kokkos-kernels/{build,install} +# cd kokkos-kernels/build +# cmake \ +# -DCMAKE_CXX_COMPILER=clang++ \ +# -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ +# -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ +# -DCMAKE_INSTALL_PREFIX= \ +# -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ +# -DKokkosKernels_ENABLE_TESTS=ON \ +# -DKokkosKernels_ENABLE_PERFTESTS=ON \ +# -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ +# -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ +# -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ +# -DKokkosKernels_INST_DOUBLE=ON \ +# -DKokkosKernels_INST_ORDINAL_INT=ON \ +# -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ +# -DKokkosKernels_INST_OFFSET_INT=ON \ +# -DKokkosKernels_INST_LAYOUTLEFT=ON \ +# -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ +# -DCMAKE_EXE_LINKER_FLAGS="" \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkosKernels_ENABLE_DOCS=OFF \ +# .. +# +# - name: build_kokkos_kernels +# working-directory: kokkos-kernels/build +# run: make -j12 all +# +# - name: test +# working-directory: kokkos-kernels/build +# run: ctest --output-on-failure -V --timeout 3600 +# +# \ No newline at end of file diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index 279b6089fa..8ba4eb308a 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -1,15 +1,15 @@ name: github-H100 # Only allow manual runs until at2 runners are available. -on: workflow_dispatch - #pull_request: - # paths-ignore: - # - '**/*.rst' - # - '**/*.md' - # - '**/requirements.txt' - # - '**/*.py' - # - 'docs/**' - # types: [ opened, reopened, synchronize ] +on: + pull_request: + paths-ignore: + - '**/*.rst' + - '**/*.md' + - '**/requirements.txt' + - '**/*.py' + - 'docs/**' + types: [ opened, reopened, synchronize ] permissions: contents: none @@ -20,8 +20,8 @@ concurrency: cancel-in-progress: true jobs: - PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL_2: - name: PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL_2 + PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL: + name: PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL runs-on: [kk-env-cuda-11.8.0-gcc-11.3.0-latest] steps: diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index 7f4518b244..fa20b49c3b 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -1,15 +1,14 @@ name: github-MI210 -# Only allow manual runs until at2 runners are available. -on: workflow_dispatch - #pull_request: - # paths-ignore: - # - '**/*.rst' - # - '**/*.md' - # - '**/requirements.txt' - # - '**/*.py' - # - 'docs/**' - # types: [ opened, reopened, synchronize ] +on: + pull_request: + paths-ignore: + - '**/*.rst' + - '**/*.md' + - '**/requirements.txt' + - '**/*.py' + - 'docs/**' + types: [ opened, reopened, synchronize ] permissions: contents: none @@ -20,87 +19,87 @@ concurrency: cancel-in-progress: true jobs: - PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL: - name: PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL - runs-on: [kk-env-hip-5.6.1-latest] - - steps: - - name: checkout_kokkos_kernels - uses: actions/checkout@v4 - with: - path: kokkos-kernels - - - name: checkout_kokkos - uses: actions/checkout@v4 - with: - repository: kokkos/kokkos - ref: ${{ github.base_ref }} - path: kokkos - - - name: configure_kokkos - run: | - mkdir -p kokkos/{build,install} - cd kokkos/build - HIPCC=$(which hipcc) - cmake -DCMAKE_CXX_COMPILER=$HIPCC \ - -DCMAKE_CXX_FLAGS=-O3 \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_SERIAL=ON \ - -DKokkos_ENABLE_HIP=ON \ - -DKokkos_ARCH_VEGA90A=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - .. - - - name: build_and_install_kokkos - working-directory: kokkos/build - run: make -j16 install - - - name: configure_kokkos_kernels - run: | - mkdir -p kokkos-kernels/{build,install} - cd kokkos-kernels/build - HIPCC=$(which hipcc) - cmake -DCMAKE_CXX_COMPILER=$HIPCC \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 " \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DCMAKE_EXE_LINKER_FLAGS="" \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. - - - name: build - working-directory: kokkos-kernels/build - run: make -j12 all - - - name: test - working-directory: kokkos-kernels/build - run: ctest --output-on-failure -V --timeout 3600 +# PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL: +# name: PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL +# runs-on: [kk-env-hip-5.6.1-latest] +# +# steps: +# - name: checkout_kokkos_kernels +# uses: actions/checkout@v4 +# with: +# path: kokkos-kernels +# +# - name: checkout_kokkos +# uses: actions/checkout@v4 +# with: +# repository: kokkos/kokkos +# ref: ${{ github.base_ref }} +# path: kokkos +# +# - name: configure_kokkos +# run: | +# mkdir -p kokkos/{build,install} +# cd kokkos/build +# HIPCC=$(which hipcc) +# cmake -DCMAKE_CXX_COMPILER=$HIPCC \ +# -DCMAKE_CXX_FLAGS=-O3 \ +# -DCMAKE_EXE_LINKER_FLAGS= \ +# -DCMAKE_INSTALL_PREFIX=$PWD/../install \ +# -DKokkos_ENABLE_SERIAL=ON \ +# -DKokkos_ENABLE_HIP=ON \ +# -DKokkos_ARCH_VEGA90A=ON \ +# -DKokkos_ENABLE_TESTS=OFF \ +# -DKokkos_ENABLE_EXAMPLES=OFF \ +# -DCMAKE_VERBOSE_MAKEFILE=ON \ +# -DCMAKE_CXX_EXTENSIONS=OFF \ +# -DCMAKE_CXX_STANDARD=17 \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +# -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ +# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +# .. +# +# - name: build_and_install_kokkos +# working-directory: kokkos/build +# run: make -j16 install +# +# - name: configure_kokkos_kernels +# run: | +# mkdir -p kokkos-kernels/{build,install} +# cd kokkos-kernels/build +# HIPCC=$(which hipcc) +# cmake -DCMAKE_CXX_COMPILER=$HIPCC \ +# -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ +# -DCMAKE_CXX_FLAGS="-O3 " \ +# -DCMAKE_INSTALL_PREFIX= \ +# -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ +# -DKokkosKernels_ENABLE_TESTS=ON \ +# -DKokkosKernels_ENABLE_PERFTESTS=ON \ +# -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ +# -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ +# -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ +# -DKokkosKernels_INST_DOUBLE=ON \ +# -DKokkosKernels_INST_ORDINAL_INT=ON \ +# -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ +# -DKokkosKernels_INST_OFFSET_INT=ON \ +# -DKokkosKernels_INST_LAYOUTLEFT=ON \ +# -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ +# -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ +# -DCMAKE_EXE_LINKER_FLAGS="" \ +# -DBUILD_SHARED_LIBS=OFF \ +# -DKokkosKernels_ENABLE_DOCS=OFF \ +# .. +# +# - name: build +# working-directory: kokkos-kernels/build +# run: make -j12 all +# +# - name: test +# working-directory: kokkos-kernels/build +# run: ctest --output-on-failure -V --timeout 3600 PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_REL: name: PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_REL From f445b23603e33388107018b8b1ebb76f0ff3377d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 May 2024 16:00:46 -0600 Subject: [PATCH 250/326] Bump ossf/scorecard-action from 2.0.6 to 2.3.3 (#2214) Bumps [ossf/scorecard-action](https://github.com/ossf/scorecard-action) from 2.0.6 to 2.3.3. - [Release notes](https://github.com/ossf/scorecard-action/releases) - [Changelog](https://github.com/ossf/scorecard-action/blob/main/RELEASE.md) - [Commits](https://github.com/ossf/scorecard-action/compare/99c53751e09b9529366343771cc321ec74e9bd3d...dc50aa9510b46c811795eb24b2f1ba02a914e534) --- updated-dependencies: - dependency-name: ossf/scorecard-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 7ee2c9c2cf..dd6835b3ff 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -43,7 +43,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6 + uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3 with: results_file: results.sarif results_format: sarif From 90d5f89f178068e6a021cbaf3642ece9c790178d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 May 2024 16:02:23 -0600 Subject: [PATCH 251/326] Bump actions/upload-artifact from 3.1.3 to 4.3.3 (#2215) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3.1.3 to 4.3.3. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/a8a3f3ad30e3422c9c7b888a15615d19a852ae32...65462800fd760344b1a7b4382951275a0abb4808) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index dd6835b3ff..4e02538f72 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -65,7 +65,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 with: name: SARIF file path: results.sarif From 9e419a83acea05aef5c7583d023491ac0f2770b6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 May 2024 16:03:02 -0600 Subject: [PATCH 252/326] Bump actions/dependency-review-action from 2.5.1 to 4.3.2 (#2217) Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 2.5.1 to 4.3.2. - [Release notes](https://github.com/actions/dependency-review-action/releases) - [Commits](https://github.com/actions/dependency-review-action/compare/0efb1d1d84fc9633afcdaad14c485cbbc90ef46c...0c155c5e8556a497adf53f2c18edabf945ed8e70) --- updated-dependencies: - dependency-name: actions/dependency-review-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dependency-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index d084a0f10a..bdf3d201a2 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -24,4 +24,4 @@ jobs: - name: 'Checkout Repository' uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 - name: 'Dependency Review' - uses: actions/dependency-review-action@0efb1d1d84fc9633afcdaad14c485cbbc90ef46c # v2.5.1 + uses: actions/dependency-review-action@0c155c5e8556a497adf53f2c18edabf945ed8e70 # v4.3.2 From 456d65278b39ec723d4d190ba2c46f7ee060864a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 May 2024 16:03:36 -0600 Subject: [PATCH 253/326] Bump step-security/harden-runner from 2.7.1 to 2.8.0 (#2218) Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.7.1 to 2.8.0. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/a4aa98b93cab29d9b1101a6143fb8bce00e2eac4...f086349bfa2bd1361f7909c78558e816508cdc10) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 2 +- .github/workflows/dependency-review.yml | 2 +- .github/workflows/scorecards.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 079acb93bf..9c22cf3639 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 + uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 with: egress-policy: audit diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index bdf3d201a2..88b6efd632 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 + uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 with: egress-policy: audit diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 4e02538f72..637a464406 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 + uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 with: egress-policy: audit From 726375a4916ec4d63404d9f7fb5376f31b7d299d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 07:04:02 -0600 Subject: [PATCH 254/326] Bump github/codeql-action from 2.25.3 to 3.25.6 (#2216) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 2.25.3 to 3.25.6. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/ceaec5c11a131e0d282ff3b6f095917d234caace...9fdb3e49720b44c48891d036bb502feb25684276) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 9c22cf3639..7e72e991f5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@ceaec5c11a131e0d282ff3b6f095917d234caace # v2.25.3 + uses: github/codeql-action/init@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@ceaec5c11a131e0d282ff3b6f095917d234caace # v2.25.3 + uses: github/codeql-action/analyze@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 637a464406..ba04204377 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@ceaec5c11a131e0d282ff3b6f095917d234caace # v2.25.3 + uses: github/codeql-action/upload-sarif@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: sarif_file: results.sarif From ee704333e9a3355ae53c184528f6f66a381394f2 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 29 May 2024 23:38:07 +0200 Subject: [PATCH 255/326] Kokkos Kernels: initial security policy (#2220) We state were patches will be made and how to report security vulnerabilities to the project. --- .github/SECURITY.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/SECURITY.md diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 0000000000..e352c31765 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1,14 @@ +# Security Policy + +## Supported Versions + +Kokkos Kernels will issue vulnerability fixes in the develop branch of the project repository. +Typically a patch release will be issued shortly after a vulnerability is detected and fixed. +The project does not typically issue patch releases for older versions and if it happens, announcements will be made on our website and/or slack channel. + +## Reporting a Vulnerability + +To report a security issue, please email lberge@sandia.gov and srajama@sandia.gov with a description of the issue, the steps you took to create the issue, affected versions, and, if known, mitigations for the issue. +Our vulnerability management team will respond within 5 working days of your email. +If the issue is confirmed as a vulnerability, we will open a Security Advisory. +This project follows a 90 day disclosure timeline. From 8dfcc195e5c720e65fb6a6b4aa2e24a8d1f6f2ab Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 30 May 2024 00:14:15 +0200 Subject: [PATCH 256/326] Kokkos Kernels: adding SHA for github actions (#2221) * Kokkos Kernels: adding SHA for github actions * Adding sha for docker images --- .github/workflows/bdw.yml | 4 ++-- .github/workflows/docs.yml | 4 ++-- .github/workflows/format.yml | 2 +- .github/workflows/h100.yml | 4 ++-- .github/workflows/mi210.yml | 4 ++-- .github/workflows/osx.yml | 4 ++-- .github/workflows/spr.yml | 4 ++-- scripts/docker/Dockerfile.hip | 2 +- scripts/docker/Dockerfile.sycl | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index 381c813bbe..12d74cb8e5 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -188,12 +188,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v3 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v3 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b2bf72d2ef..ff5120ebe9 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,12 +25,12 @@ jobs: doxygen --version - name: checkout_kokkos_kernels - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 6e2db4031a..b428de69db 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 - name: Install Dependencies run: sudo apt install clang-format-8 diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index 8ba4eb308a..ebf1e2ac39 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v3 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v3 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index fa20b49c3b..ef0f15c5d4 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -107,12 +107,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 9f05579fa5..89ce49d535 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml index efb585ef02..0994f36786 100644 --- a/.github/workflows/spr.yml +++ b/.github/workflows/spr.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v4 + uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/scripts/docker/Dockerfile.hip b/scripts/docker/Dockerfile.hip index 61305b2159..ab3edd7503 100644 --- a/scripts/docker/Dockerfile.hip +++ b/scripts/docker/Dockerfile.hip @@ -1,4 +1,4 @@ -ARG BASE=rocm/dev-ubuntu-20.04:5.2 +ARG BASE=rocm/dev-ubuntu-20.04@sha256:f345894ae2d64c0d4676070301b0368599c504462bcb5934aa40f65357078c85 # 5.2 FROM $BASE RUN apt-get update && apt-get install -y \ diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 714461bfe6..4dbd69dcba 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:11.7.1-devel-ubuntu22.04 +ARG BASE=nvidia/cuda@sha256:38e59267704b5d91ef63c7d8f613359c629fab0aead1283d59ca7821029e73bc # 11.7.1-devel-ubuntu22.04 FROM $BASE RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub From 8abbc41e8ed0022cfdf50477c8629707e54040a2 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 30 May 2024 15:39:44 +0200 Subject: [PATCH 257/326] More dependencies fix (#2222) * workflows: fix sha for label checker docker image * workflows: fixing Docker files dependencies. --- .github/workflows/osx.yml | 2 +- scripts/docker/Dockerfile.hip | 3 +-- scripts/docker/Dockerfile.sycl | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 89ce49d535..d69b4a2c05 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -22,7 +22,7 @@ jobs: check-pr-labels: runs-on: [ubuntu-latest] steps: - - uses: docker://agilepathway/pull-request-label-checker:latest + - uses: docker://agilepathway/pull-request-label-checker@sha256:ee57b0e1aedab22063ce6467a6e6358e254a9204693ca20d8a16b2d891db8d5f # v1.6.32 with: none_of: 'AT: WIP' repo_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/scripts/docker/Dockerfile.hip b/scripts/docker/Dockerfile.hip index ab3edd7503..83d2c99f53 100644 --- a/scripts/docker/Dockerfile.hip +++ b/scripts/docker/Dockerfile.hip @@ -1,5 +1,4 @@ -ARG BASE=rocm/dev-ubuntu-20.04@sha256:f345894ae2d64c0d4676070301b0368599c504462bcb5934aa40f65357078c85 # 5.2 -FROM $BASE +FROM rocm/dev-ubuntu-20.04@sha256:f345894ae2d64c0d4676070301b0368599c504462bcb5934aa40f65357078c85 AS BASE # rocm/dev-ubuntu-20.04:5.2 RUN apt-get update && apt-get install -y \ git \ diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 4dbd69dcba..62f91404cd 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,5 +1,4 @@ -ARG BASE=nvidia/cuda@sha256:38e59267704b5d91ef63c7d8f613359c629fab0aead1283d59ca7821029e73bc # 11.7.1-devel-ubuntu22.04 -FROM $BASE +FROM nvidia/cuda@sha256:38e59267704b5d91ef63c7d8f613359c629fab0aead1283d59ca7821029e73bc AS BASE # nvidia/cuda:11.7.1-devel-ubuntu22.04 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub From 38132f350a7a76dd9436873de303f18255bf6894 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 30 May 2024 16:33:35 +0200 Subject: [PATCH 258/326] Fix Docker files (#2223) --- scripts/docker/Dockerfile.hip | 2 +- scripts/docker/Dockerfile.sycl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/docker/Dockerfile.hip b/scripts/docker/Dockerfile.hip index 83d2c99f53..1c4ae9ea98 100644 --- a/scripts/docker/Dockerfile.hip +++ b/scripts/docker/Dockerfile.hip @@ -1,4 +1,4 @@ -FROM rocm/dev-ubuntu-20.04@sha256:f345894ae2d64c0d4676070301b0368599c504462bcb5934aa40f65357078c85 AS BASE # rocm/dev-ubuntu-20.04:5.2 +FROM rocm/dev-ubuntu-20.04@sha256:f345894ae2d64c0d4676070301b0368599c504462bcb5934aa40f65357078c85 RUN apt-get update && apt-get install -y \ git \ diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 62f91404cd..ec65df3313 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,4 +1,4 @@ -FROM nvidia/cuda@sha256:38e59267704b5d91ef63c7d8f613359c629fab0aead1283d59ca7821029e73bc AS BASE # nvidia/cuda:11.7.1-devel-ubuntu22.04 +FROM nvidia/cuda@sha256:38e59267704b5d91ef63c7d8f613359c629fab0aead1283d59ca7821029e73bc RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub From f352a0371eacec674258589989e724373a070c04 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Fri, 31 May 2024 11:00:06 -0600 Subject: [PATCH 259/326] SPMV TPLs: improve profile region labels (#2219) - Mark rank-2 versions as "spmv_mv" not "spmv" (the native impl has this, and it's useful to know which one is being run) - Add missing commas separating "BSRMATRIX" and the scalar type name --- .../tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 10 +++++----- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 3564fa68fd..7096eb4131 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -227,7 +227,7 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, const AMatrix& A, const XVector& X, \ const coefficient_type& beta, \ const YVector& Y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_bsr_mkl(handle, mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ @@ -292,7 +292,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP) const AMatrix& A, const XVector& X, \ const coefficient_type& beta, \ const YVector& Y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + std::string label = "KokkosSparse::spmv_mv[TPL_MKL,BSRMATRIX," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ MKL_INT colx = static_cast(X.extent(1)); \ @@ -610,7 +610,7 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_bsr_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ @@ -695,7 +695,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + std::string label = "KokkosSparse::spmv_mv[TPL_CUSPARSE,BSRMATRIX," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_mv_bsr_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ @@ -1029,7 +1029,7 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX" + \ + std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_bsr_rocsparse(exec, handle, mode, alpha, A, x, beta, y); \ diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index c52047ab25..eee1812a06 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -253,7 +253,7 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, const char mode[], const coefficient_type &alpha, \ const AMatrix &A, const XVector &x, \ const coefficient_type &beta, const YVector &y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + std::string label = "KokkosSparse::spmv_mv[TPL_CUSPARSE," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_mv_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ From d2214e11abab9aaca7430bf1b30bdc16997e9661 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 31 May 2024 16:23:06 -0700 Subject: [PATCH 260/326] cusparse spgemm: provide non-null row-ptr (#2213) --- .../KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index a718769c7b..70f1079e7b 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -87,9 +87,20 @@ void spgemm_symbolic_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType)); +#if CUDA_VERSION >= 12020 + // at some point cusparseCreateCsr started to need a non-null row-pointer + // array, even if the operation that consumed the handle doesn't need to + // read it. This was observed on a system with CUDA 12.2, but it may have + // started earlier. + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &h->descr_C, m, k, 0, (void *)row_mapC.data(), nullptr, nullptr, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + h->scalarType)); +#else KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( &h->descr_C, m, k, 0, nullptr, nullptr, nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType)); +#endif //---------------------------------------------------------------------- // ask bufferSize1 bytes for external memory From 51906f12a9cf97e73f9150c79c0cb0f744fe38f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Jun 2024 12:55:47 -0600 Subject: [PATCH 261/326] Bump github/codeql-action from 3.25.6 to 3.25.7 (#2225) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.6 to 3.25.7. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/9fdb3e49720b44c48891d036bb502feb25684276...f079b8493333aace61c81488f8bd40919487bd9f) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7e72e991f5..fbcda17691 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 + uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 + uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index ba04204377..af518b9a5a 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 + uses: github/codeql-action/upload-sarif@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 with: sarif_file: results.sarif From 32aa75a8f20ca88df64bde421c335b9fa6f68397 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Jun 2024 12:56:08 -0600 Subject: [PATCH 262/326] Bump actions/checkout from 3.6.0 to 4.1.6 (#2226) Bumps [actions/checkout](https://github.com/actions/checkout) from 3.6.0 to 4.1.6. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/f43a0e5ff2bd294095638e18286ca9a3d1956744...a5ac7e51b41094c92402da3b24376905380afc29) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/bdw.yml | 4 ++-- .github/workflows/codeql.yml | 4 ++-- .github/workflows/dependency-review.yml | 2 +- .github/workflows/docs.yml | 4 ++-- .github/workflows/format.yml | 2 +- .github/workflows/h100.yml | 4 ++-- .github/workflows/mi210.yml | 4 ++-- .github/workflows/osx.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- .github/workflows/spr.yml | 4 ++-- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index 12d74cb8e5..f60008ab72 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -188,12 +188,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index fbcda17691..20b6a634d3 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -38,7 +38,7 @@ jobs: egress-policy: audit - name: checkout_kokkos_kernels - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: path: kokkos-kernels @@ -52,7 +52,7 @@ jobs: # Prefix the list here with "+" to use these queries and those in the config file. - name: checkout_kokkos - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: repository: 'kokkos/kokkos' path: 'kokkos' diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 88b6efd632..7141d09337 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -22,6 +22,6 @@ jobs: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - name: 'Dependency Review' uses: actions/dependency-review-action@0c155c5e8556a497adf53f2c18edabf945ed8e70 # v4.3.2 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ff5120ebe9..901e218fdc 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,12 +25,12 @@ jobs: doxygen --version - name: checkout_kokkos_kernels - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index b428de69db..5517b68dbb 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - name: Install Dependencies run: sudo apt install clang-format-8 diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index ebf1e2ac39..5fd01d972b 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index ef0f15c5d4..7b55f065bf 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -107,12 +107,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index d69b4a2c05..082467d614 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index af518b9a5a..4f4f03469a 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -38,7 +38,7 @@ jobs: egress-policy: audit - name: "Checkout code" - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: persist-credentials: false diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml index 0994f36786..8fe8053f5b 100644 --- a/.github/workflows/spr.yml +++ b/.github/workflows/spr.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v4.1.6 + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} From a7d02ca9bc45edd6f96ce7a762c2540ea3c1d323 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 5 Jun 2024 15:20:53 -0600 Subject: [PATCH 263/326] README: adding link to scorecard (#2230) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bdad1442ce..173e3e3333 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/kokkos/kokkos-kernels/badge)](https://securityscorecards.dev/viewer/?uri=github.com/kokkos/kokkos-kernels) [![Generic badge](https://readthedocs.org/projects/kokkos-kernels/badge/?version=latest)](https://kokkos-kernels.readthedocs.io/en/latest/) ![KokkosKernels](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) From a955b8b6d6bdcde7ef8dbf5a98cac481bec4587d Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Fri, 7 Jun 2024 09:38:24 -0600 Subject: [PATCH 264/326] spmv_mv wrappers for rocsparse (#2233) * spmv_mv wrappers for rocsparse (rocsparse_spmm()) * Use consistent types for alpha/beta in spmv wrappers --- sparse/src/KokkosSparse_spmv.hpp | 54 +++-- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 12 +- .../KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 45 ++++ .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 203 +++++++++++++++++- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 14 +- 5 files changed, 283 insertions(+), 45 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 336bae4f1d..ddbef56504 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -263,10 +263,7 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], ///////////////// #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the conjugate mode (C) - if constexpr (std::is_same_v || - std::is_same_v) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] == Conjugate[0]); } // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for @@ -278,20 +275,19 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - if (std::is_same::value) { + if (std::is_same_v) { useNative = useNative || (mode[0] != NoTranspose[0]); } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same_v) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] == Conjugate[0]); } #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] == Conjugate[0]); } #endif @@ -324,7 +320,14 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], // CRS, rank 2 // ///////////////// #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - useNative = useNative || (Conjugate[0] == mode[0]); + if constexpr (std::is_same_v) { + useNative = useNative || (Conjugate[0] == mode[0]); + } +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if constexpr (std::is_same_v) { + useNative = useNative || (Conjugate[0] == mode[0]); + } #endif if (useNative) { @@ -355,25 +358,21 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], ///////////////// #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the modes (C), (T), (H) - if (std::is_same::value || - std::is_same::value) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] != NoTranspose[0]); } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] == Conjugate[0]); } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE // rocSparse does not support the modes (C), (T), (H) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] != NoTranspose[0]); } #endif @@ -403,17 +402,14 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], ///////////////// #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the modes (C), (T), (H) - if (std::is_same::value || - std::is_same::value) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] != NoTranspose[0]); } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { + if constexpr (std::is_same_v) { useNative = useNative || (mode[0] == Conjugate[0]); } #endif @@ -593,8 +589,8 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], "KokkosSparse::spmv_struct: Both Vector inputs must have rank 1 in " "order to call this specialization of spmv."); // Make sure that y is non-const. - static_assert(std::is_same::value, + static_assert(std::is_same_v, "KokkosSparse::spmv_struct: Output Vector must be non-const."); // Check compatibility of dimensions at run time. @@ -886,8 +882,8 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], static_assert(XVector::rank == YVector::rank, "KokkosSparse::spmv: Vector ranks do not match."); // Make sure that y is non-const. - static_assert(std::is_same::value, + static_assert(std::is_same_v, "KokkosSparse::spmv: Output Vector must be non-const."); // Check compatibility of dimensions at run time. diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 7096eb4131..3b574c8866 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -348,9 +348,9 @@ namespace Impl { template void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], - typename YVector::non_const_value_type const& alpha, + typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, - typename YVector::non_const_value_type const& beta, + typename YVector::const_value_type& beta, const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using entry_type = typename AMatrix::non_const_ordinal_type; @@ -463,9 +463,9 @@ void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, template void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], - typename YVector::non_const_value_type const& alpha, + typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, - typename YVector::non_const_value_type const& beta, + typename YVector::const_value_type& beta, const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using entry_type = typename AMatrix::non_const_ordinal_type; @@ -751,9 +751,9 @@ namespace Impl { template void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], - typename YVector::non_const_value_type const& alpha, + typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, - typename YVector::non_const_value_type const& beta, + typename YVector::const_value_type& beta, const YVector& y) { /* rocm 5.4.0 rocsparse_*bsrmv reference: diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 44a8098ca3..2e2ee561a1 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -155,6 +155,51 @@ KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, #endif // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) #endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, XL, YL, \ + MEMSPACE) \ + template <> \ + struct spmv_mv_tpl_spec_avail< \ + Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const rocsparse_int>, \ + Kokkos::View< \ + const SCALAR**, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +#define AVAIL_ROCSPARSE_SCALAR_MEMSPACE(SCALAR, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, Kokkos::LayoutLeft, \ + Kokkos::LayoutLeft, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, Kokkos::LayoutLeft, \ + Kokkos::LayoutRight, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, Kokkos::LayoutRight, \ + Kokkos::LayoutLeft, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, Kokkos::LayoutRight, \ + Kokkos::LayoutRight, MEMSPACE) + +#define AVAIL_ROCSPARSE_SCALAR(SCALAR) \ + AVAIL_ROCSPARSE_SCALAR_MEMSPACE(SCALAR, Kokkos::HIPSpace) \ + AVAIL_ROCSPARSE_SCALAR_MEMSPACE(SCALAR, Kokkos::HIPManagedSpace) + +AVAIL_ROCSPARSE_SCALAR(float) +AVAIL_ROCSPARSE_SCALAR(double) +AVAIL_ROCSPARSE_SCALAR(Kokkos::complex) +AVAIL_ROCSPARSE_SCALAR(Kokkos::complex) + +#undef AVAIL_ROCSPARSE_SCALAR_MEMSPACE +#undef AVAIL_ROCSPARSE_SCALAR +#undef KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_ROCSPARSE + +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index eee1812a06..ee0bf77c88 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -102,9 +102,9 @@ cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { template void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, const char mode[], - typename YVector::non_const_value_type const &alpha, + typename YVector::const_value_type &alpha, const AMatrix &A, const XVector &x, - typename YVector::non_const_value_type const &beta, + typename YVector::const_value_type &beta, const YVector &y) { static_assert(XVector::rank == 2, "should only be instantiated for multivector"); @@ -337,4 +337,203 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, #endif // defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// rocSPARSE +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) +#include "KokkosSparse_Utils_rocsparse.hpp" + +namespace KokkosSparse { +namespace Impl { + +template +void spmv_mv_rocsparse(const Kokkos::HIP &exec, Handle *handle, + const char mode[], + typename YVector::const_value_type &alpha, + const AMatrix &A, const XVector &x, + typename YVector::const_value_type &beta, + const YVector &y) { + using offset_type = typename AMatrix::non_const_size_type; + using entry_type = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + + // initialize rocsparse library + rocsparse_handle rocsparseHandle = + KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; + // Set rocsparse to use the given stream until this function exits + TemporarySetRocsparseStream tsrs(rocsparseHandle, exec); + + rocsparse_operation rocsparseOperation = mode_kk_to_rocsparse(mode); + rocsparse_indextype offset_index_type = rocsparse_index_type(); + rocsparse_indextype entry_index_type = rocsparse_index_type(); + rocsparse_datatype compute_type = rocsparse_compute_type(); + + // Create rocsparse dense multivectors for X and Y + void *x_data = static_cast( + const_cast(x.data())); + void *y_data = static_cast( + const_cast(y.data())); + + size_t x_ld, y_ld; + rocsparse_order x_order, y_order; + if constexpr (std::is_same_v) { + x_ld = x.stride(1); + x_order = rocsparse_order_column; + } else { + static_assert( + std::is_same_v, + "rocsparse_spmm internal logic error: x is neither LayoutLeft nor " + "LayoutRight"); + x_ld = x.stride(0); + x_order = rocsparse_order_row; + } + if constexpr (std::is_same_v) { + y_ld = y.stride(1); + y_order = rocsparse_order_column; + } else { + static_assert( + std::is_same_v, + "rocsparse_spmm internal logic error: y is neither LayoutLeft nor " + "LayoutRight"); + y_ld = y.stride(0); + y_order = rocsparse_order_row; + } + + rocsparse_dnmat_descr vecX, vecY; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_dnmat_descr( + &vecX, x.extent(0), x.extent(1), x_ld, x_data, + rocsparse_compute_type(), + x_order)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_dnmat_descr( + &vecY, y.extent(0), y.extent(1), y_ld, y_data, + rocsparse_compute_type(), + y_order)); + + rocsparse_spmm_alg alg = rocsparse_spmm_alg_default; + + KokkosSparse::Impl::RocSparse_CRS_SpMV_Data *subhandle; + if (handle->tpl_rank2) { + subhandle = dynamic_cast( + handle->tpl_rank2); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: rank-2 subhandle is not set up for rocsparse " + "CRS"); + subhandle->set_exec_space(exec); + } else { + subhandle = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec); + handle->tpl_rank2 = subhandle; + // Create the rocsparse csr descr + // We need to do some casting to void* + // Note that row_map is always a const view so const_cast is necessary, + // however entries and values may not be const so we need to check first. + void *csr_row_ptr = + static_cast(const_cast(A.graph.row_map.data())); + void *csr_col_ind = + static_cast(const_cast(A.graph.entries.data())); + void *csr_val = + static_cast(const_cast(A.values.data())); + + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( + &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, + csr_col_ind, csr_val, offset_index_type, entry_index_type, + rocsparse_index_base_zero, compute_type)); + + // Size and allocate buffer, and analyze the matrix + + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmm( + rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, + subhandle->mat, vecX, &beta, vecY, compute_type, alg, + rocsparse_spmm_stage_buffer_size, &subhandle->bufferSize, nullptr)); + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMalloc(&subhandle->buffer, subhandle->bufferSize)); + + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmm( + rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, + subhandle->mat, vecX, &beta, vecY, compute_type, alg, + rocsparse_spmm_stage_preprocess, &subhandle->bufferSize, + subhandle->buffer)); + } + + // Perform the actual computation + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmm( + rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, + subhandle->mat, vecX, &beta, vecY, compute_type, alg, + rocsparse_spmm_stage_compute, &subhandle->bufferSize, subhandle->buffer)); + + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnmat_descr(vecY)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnmat_descr(vecX)); +} + +#define KOKKOSSPARSE_SPMV_MV_ROCSPARSE(SCALAR, XL, YL, MEMSPACE) \ + template <> \ + struct SPMV_MV< \ + Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + rocsparse_int const>, \ + Kokkos::View< \ + SCALAR const **, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const **, XL, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_mv(const Kokkos::HIP &exec, Handle *handle, \ + const char mode[], const coefficient_type &alpha, \ + const AMatrix &A, const XVector &x, \ + const coefficient_type &beta, const YVector &y) { \ + std::string label = "KokkosSparse::spmv_mv[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mv_rocsparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define INST_ROCSPARSE_SCALAR_MEMSPACE(SCALAR, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_ROCSPARSE(SCALAR, Kokkos::LayoutLeft, \ + Kokkos::LayoutLeft, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_ROCSPARSE(SCALAR, Kokkos::LayoutLeft, \ + Kokkos::LayoutRight, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_ROCSPARSE(SCALAR, Kokkos::LayoutRight, \ + Kokkos::LayoutLeft, MEMSPACE) \ + KOKKOSSPARSE_SPMV_MV_ROCSPARSE(SCALAR, Kokkos::LayoutRight, \ + Kokkos::LayoutRight, MEMSPACE) + +#define INST_ROCSPARSE_SCALAR(SCALAR) \ + INST_ROCSPARSE_SCALAR_MEMSPACE(SCALAR, Kokkos::HIPSpace) \ + INST_ROCSPARSE_SCALAR_MEMSPACE(SCALAR, Kokkos::HIPManagedSpace) + +INST_ROCSPARSE_SCALAR(float) +INST_ROCSPARSE_SCALAR(double) +INST_ROCSPARSE_SCALAR(Kokkos::complex) +INST_ROCSPARSE_SCALAR(Kokkos::complex) + +#undef INST_ROCSPARSE_SCALAR_MEMSPACE +#undef INST_ROCSPARSE_SCALAR +#undef KOKKOSSPARSE_SPMV_MV_ROCSPARSE + +} // namespace Impl +} // namespace KokkosSparse +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + #endif // KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index c8d25c2c58..be2588483f 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -30,9 +30,8 @@ namespace Impl { template void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], - typename YVector::non_const_value_type const& alpha, - const AMatrix& A, const XVector& x, - typename YVector::non_const_value_type const& beta, + typename YVector::const_value_type& alpha, const AMatrix& A, + const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using value_type = typename AMatrix::non_const_value_type; @@ -357,9 +356,8 @@ namespace Impl { template void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], - typename YVector::non_const_value_type const& alpha, - const AMatrix& A, const XVector& x, - typename YVector::non_const_value_type const& beta, + typename YVector::const_value_type& alpha, const AMatrix& A, + const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using entry_type = typename AMatrix::non_const_ordinal_type; @@ -703,9 +701,9 @@ template inline void spmv_onemkl(const execution_space& exec, Handle* handle, oneapi::mkl::transpose mkl_mode, - typename matrix_type::non_const_value_type const alpha, + typename yview_type::const_value_type& alpha, const matrix_type& A, const xview_type& x, - typename matrix_type::non_const_value_type const beta, + typename yview_type::const_value_type& beta, const yview_type& y) { using scalar_type = typename matrix_type::non_const_value_type; using onemkl_scalar_type = typename KokkosToOneMKLScalar::type; From 354cc4b14d1625e2e7af3fbf0994fa366e522cf4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:13:09 -0600 Subject: [PATCH 265/326] Bump step-security/harden-runner from 2.8.0 to 2.8.1 (#2236) Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.8.0 to 2.8.1. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/f086349bfa2bd1361f7909c78558e816508cdc10...17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 2 +- .github/workflows/dependency-review.yml | 2 +- .github/workflows/scorecards.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 20b6a634d3..ad5de0bf03 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 with: egress-policy: audit diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 7141d09337..f37ea6dda9 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 with: egress-policy: audit diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 4f4f03469a..a3ecec5cbc 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 with: egress-policy: audit From ece1c696d8e55c9427b9e8b6d61cc472a6fa5a88 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:13:36 -0600 Subject: [PATCH 266/326] Bump github/codeql-action from 3.25.7 to 3.25.8 (#2237) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.7 to 3.25.8. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/f079b8493333aace61c81488f8bd40919487bd9f...2e230e8fe0ad3a14a340ad0815ddb96d599d2aff) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index ad5de0bf03..06328c83c1 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 + uses: github/codeql-action/init@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 + uses: github/codeql-action/analyze@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index a3ecec5cbc..2885ca7fae 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 + uses: github/codeql-action/upload-sarif@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 with: sarif_file: results.sarif From d06924a2e3aeaa33fb515ea7c81559e932680b0d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:14:00 -0600 Subject: [PATCH 267/326] Bump actions/dependency-review-action from 4.3.2 to 4.3.3 (#2235) Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 4.3.2 to 4.3.3. - [Release notes](https://github.com/actions/dependency-review-action/releases) - [Commits](https://github.com/actions/dependency-review-action/compare/0c155c5e8556a497adf53f2c18edabf945ed8e70...72eb03d02c7872a771aacd928f3123ac62ad6d3a) --- updated-dependencies: - dependency-name: actions/dependency-review-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dependency-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index f37ea6dda9..b911317970 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -24,4 +24,4 @@ jobs: - name: 'Checkout Repository' uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - name: 'Dependency Review' - uses: actions/dependency-review-action@0c155c5e8556a497adf53f2c18edabf945ed8e70 # v4.3.2 + uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 From 1696b1cfa3043a076e8f28c734fe36bace025ff5 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 10 Jun 2024 11:44:57 -0600 Subject: [PATCH 268/326] Add testing for transpose corner cases (#2234) * Add testing for transpose corner cases crs, bsr, graph: test cases that are 0x0, 100x0 and 0x100. In these cases make sure the matrix generator doesn't try to insert any entries (nnz = 0). * Update sparse/unit_test/Test_Sparse_Transpose.hpp Co-authored-by: Luc Berger * Update sparse/unit_test/Test_Sparse_Transpose.hpp Co-authored-by: Luc Berger --------- Co-authored-by: Luc Berger --- sparse/unit_test/Test_Sparse_Transpose.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_Transpose.hpp b/sparse/unit_test/Test_Sparse_Transpose.hpp index 05773b6b75..cf5dc81d24 100644 --- a/sparse/unit_test/Test_Sparse_Transpose.hpp +++ b/sparse/unit_test/Test_Sparse_Transpose.hpp @@ -55,7 +55,7 @@ void testTranspose(int numRows, int numCols, bool doValues) { using rowmap_t = typename crsMat_t::row_map_type::non_const_type; using entries_t = typename crsMat_t::index_type::non_const_type; using values_t = typename crsMat_t::values_type::non_const_type; - size_type nnz = 10 * numRows; + size_type nnz = (numRows * numCols > 0) ? 10 * numRows : 0; // Generate a matrix that has 0 entries in some rows crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, 3 * 10, numRows / 2); @@ -250,7 +250,7 @@ void testTransposeBsr(int numRows, int numCols, int blockSize) { using values_t = typename bsrMat_t::values_type::non_const_type; // Generate a matrix that has 0 entries in some rows - size_type nnz = 10 * numRows; + size_type nnz = (numRows * numCols > 0) ? 10 * numRows : 0; bsrMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( blockSize, numRows, numCols, nnz, 3, numRows / 4); @@ -294,6 +294,9 @@ void testTransposeBsr(int numRows, int numCols, int blockSize) { TEST_F(TestCategory, sparse_transpose_matrix) { // Test both matrix and graph transpose with various sizes + testTranspose(0, 0, true); + testTranspose(100, 0, true); + testTranspose(0, 100, true); testTranspose(100, 100, true); testTranspose(500, 50, true); testTranspose(50, 500, true); @@ -303,6 +306,9 @@ TEST_F(TestCategory, sparse_transpose_matrix) { } TEST_F(TestCategory, sparse_transpose_graph) { + testTranspose(0, 0, false); + testTranspose(100, 0, false); + testTranspose(0, 100, false); testTranspose(100, 100, false); testTranspose(500, 50, false); testTranspose(50, 500, false); @@ -314,6 +320,9 @@ TEST_F(TestCategory, sparse_transpose_graph) { TEST_F(TestCategory, sparse_transpose_bsr_matrix) { testTransposeBsrRef(); // Test bsrMatrix transpose with various sizes + testTransposeBsr(0, 0, 5); + testTransposeBsr(100, 0, 5); + testTransposeBsr(0, 100, 5); testTransposeBsr(100, 100, 3); testTransposeBsr(500, 50, 5); testTransposeBsr(50, 500, 16); From 2f26622fbef4610c51f567194bd8ec977454cb93 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 11 Jun 2024 11:16:43 -0600 Subject: [PATCH 269/326] Graph: removing executable from repo (#2239) --- example/graph/PartitioningExample | Bin 21536 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100755 example/graph/PartitioningExample diff --git a/example/graph/PartitioningExample b/example/graph/PartitioningExample deleted file mode 100755 index 88619a8d127f7c5acc2015424b160883008f33aa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 21536 zcmeHPeQ;FQb-(%ofw3eowlPq9jRAuVYY8wGIF46Bi=T`D86--G@nfZ3!4iFSwE}@7 z2O@CU2qhYd6Vjv|Ch?@64AW3gN{K^3e1xTr$5j&IFsaih4mc|$J4!ehs;M@ko8R zX6ceg>VpgG!;P(N3)_m9E?m0A+tTD+B*#tqiT9~B)v_m2L-u)OO9+~O;>pKv5`N?H z(?$0|GBiHor9$%LUz-03_}zxzMEq{YFAu-5N(qQlQZT%k27fjUeoh+vkJI33jH&wl zZW{bA)8Hd%@Cz7U$mrn$ADsnKkS1|m_@1Tt7(xr4B6ZDw;Q($drzs1HYXnL7$WTOTcY%rv9h zB27EZnn2JD)YmuFf_u}N_0i&RQwxwSp(W*w;b>WrKP7$z;bBOFvH`WNwY6powTgsU z-MBN{7&O;~T3Q=I#8#`=>YAEEjWDXVEno(sp|)^TG=v&jLQzpyTi?_Y5_LNx;V3|J zB-|LS11HJypc(?#=;&>J^`WeL2O ziMY;^|8*@Jkxj8q*bMv~MxElvgJ0#`T?qFa@geF|i2hH2n%+7+j3tpJaxx?tRMMg7 zg!$38K80nIEhYm=GST^mu#_^zOy(bcWDBv$HAd@6muZDtWR=z@v&y!|f#(z<@$(#b zy>5w6=)mioMfhR|-kH~Z4!kq384kQNXH+`yItP>fjSf8J3|+Q3@RUPz+2+7ge$l1b zfv0?>OPd4l%t3n`c*=3Q7#)MTMrTg@JdZH;_e66Ne*(qmxR85Zag00~0cm7T6#w(@ z^PxuVZA9$v8$p?~o!T@N{k^hXLv5OZ{vO%hL~U|?|8dz~M{RO>|54d~n%d;*{%+YW zr8c>^e~)Z0r#88^zge~yQJY-azeTnmqV@!8SIYJ*YLg55eX>2B+T^k$WJ|+%*5H=k^F#pliq^ZUjo7us3oztj&RSqHHY-jy|G?*oalNs3 zkmfA*uV4N1$cPd9=Wc+w|Bd@#Q`~=vb~&SK|C^E)Py8pmeR2Q4qXlf2#CE+zP4FBA zI895~h{Y4{fzcs%4$ZiKkjB~3K6r2b7Y-o|;{Jp}B!FlGqSK!^+m{nN*XbY3=#BX= z0CrVoC@dj*Q0?f+aPpk8coc!yPImfx&h||tJr^>1joMx|&WOE5rX|R<*#2HQq?fKJ z4#l^y$(jx9>ZHl>4FmX6EFi*}r*vcsozG%Fo#N(Z$p z9o#l- z7Bwx25mL0weHls20-HGruMBI*{Z3%Vg5t_RAyi{%8KG z(|$pE1S=v$syjGB0PVG0EYgfFzyu?2N+K*RL0HhT;1DQeVKYbeUlP-EH3R# zbp3%G)0;8yl@9;qjDZ)mZM2^$+j6JFwi94eqX)W1ts8n)w}L)Wn@1Ivxrd0=%O zeHCLcVpqtHYEmGHlt2mdu(VbZi;1}R0ww5AZaz=>8oP8sto>%Jb-1JbW=89Sd)tSR zj796cXTFSmCGjh?Uh78fG&RrAF>s;YB6`0ifWr$? zUdd1x6JAF*19L0}TjTG05eFK7Ke186p`YxHRSzl_%iPZDL1nBS7mRQ115={Ymm52W zN}qxFh>;n7j8%2mWjqP{jeO%QHr#Q>**Fv%zbE4cG%~bD^*^NQlEt2VgXiEi zlE8*bkrYo{Metyf)wnwbJ(A1z5gcQ5uEwmsNzDJEm^q$^m*$;(^cuwmJpZG2)TnZ? zT(r~=K9r#jv#W>go(gnNYG5=Ka+`ggq8a0{Iep+nRJZm5eMN-QzKgfel+_eTA^N&DCE`?BL9TsSNN?+H3QQr zH^@+oC(4LXWsUoO2M2XrnCHj}a)fNvbca?H ztYfmjO_MF>KAyOYy2>BWRR~>}JT0D%_Bqv@VQPYLU}=1AB~)2ShQF zie9I7UC$MdgnH1v*Z$=o+6*&f)cl%#TR=T``O&;+#D1Q5!fM77caj=*m|i1E^H@sE z+-DTgr~AZ+D>34eh}+{vNBeORZ3f#(^n9FqZ6isQQGkjkj>vB91+CUH6jXY22c1%c z+}$HrF^8ZJ?qTIUNIjj^mykXN7E4sz=aEaj%io*4@Nct)dZo}62ubtEOchDb6OXmh zr1gEU2R$G`O|w^thvI}X_f_lR2L;H5JK_9&nO5VriF2CPy1C79seD8Yh)nj zJJLLOP^Fmnf0N`xpA|J-l?9H7*sQF_I>ZXMEyVJuPUuMK7__B3WkPZ~?bm;jPWuQA z1?V3gaQekCDw9`;qVxRwQqiO=vYX_Q(L9Gd!6Hv85qCx<7viL^N;~f;oJ2!2I;YS^ zLZ^4uUgSe~zzJvH$ufTbQ=@BAffSN{kxBaw)t#J#rD-jsWEoFPrG8?x$}wnZY&9f! z4eLF;qX#FLiE+t0`{;ef=!9h$#B`~R8XbvT>g0Qrq&s?Y`N+=JKIXa}?Z;~{){eoX znaHK%9~wkQ5AgBCFb-v8o0o$v=CLMN?Iv(BWp^6C^d6nYJJvL@CsZ896JJwWAws(s zEsJ3+3kZV)(JGh1L?f1qJwvW7;79eZ%$9>w{W@dxF?6n@nd8v%kz=ps<1g3{PrUhl z()w`?kA@Xb%%WLd<{!%c5*?K1B(_5a*5c9uj$lClCffF_7$<3=qU6&B0iP)O%aYhg z$%gXS2PM_zu^*RIRdmhSHq|4ljjn~wfa@#fUn}|#4MQGHYRjg#7%$;0CxaZ z0KNiP4>%3G_CY`c@EG88fNucS0}cXy1CY*_Uj6r&C`K!PWfbLJC`z(4k;=15Qdi!RBei!Hqobp@k{^>|@7U)kn>1{SW2)YmS z`A&M!rtbs&Y0%d?>6uSk_Pq@H?lJnm0Q!rd7dquXW$Pz&#_q}r84=^N=oV3yiVHQ* zvq7)HJRe7#XU}{9$%^OD=*Y_YdX8ns0<70H=x2trpAB|D6`<2NQqif8gZQlkeK%ls zk)t1fM^-ZG97&^x;^Et%9|pYwb<5Vwrz|_^YEKX7!8c)`g*J@ z?f$1C&%8ATeF^C2Ku(D#9!F1{~-ei(GRcWK%8jIIAF z=$`{!-j~t-E4S&{i2F9smpkQKZTf7`4}qR8Udlj!VGKI$FWW(%>(n2x_3s0{8FV^u zvFwl7^p`>ZYtX$;df2940R4-gKjWllR!9c&|5ecApv%0k$M49`+1Q_c2)evKrRggD z$oZWO`Z>@aaO!`~?!OH5i=dY}>FqW>2zml^xz}j@Yi;^I&<|l>`nXemmrZ{e^y8r4 z>7-XGdKqvRKtBroA)=S&6?`GnpEvFES^hjvSGGTI-oYFrukgijMqY8p_|*;8x?72X1-bmIrQm;Fbq&dEk}@Zh7FA2mYTuuw{ZzT2Ps!>%?xb4X>90-|}mPuBD^gO~47?R%K6pRbnZh`<#vemcXUgcz_DP6Ce_47neG*QT|Am;xhRbs{;HEKNo31C$$^5xO$5%3bg3xi8jGt(&$7K9%qMPR_ z89zzvN#Xx2JW=1B5}zmZ`cBr9FZy`hC*upO_ZO1!lf|VOJ|{}nv+$%jU50d*EOCeR zzJ`;Qg;lEaM>76SysxpviIVj!ahK4~IXj720;`+zKr()sIFJ&5kD%vA-6c!ttY%Xv zon(vO;DMmzLisZje(rJknXXk4e#nKtpYb|>Yi)GxC_~uxaQYG}C4RJj8d#6s7qp&T zl7F#6yCSwH6q{^ zfPwIOKht@g51`6CoB8{#!tsaMtVMbm%__y zub{1Ed^h`Z4)cGC@vm~fdf)s4@CDdC-Esa!=Ktv(O26iRjq!Ed?+n%-XZ&{7Pw`FV zmyDl2#V6a^{%mZBWWRg7vl&0cdMNIxY-Ie$**{$hE#_eZ_P~Dk`u#NcC*!Rnov-w9 zMknie*EOzh0Y8OxI>C>b|A*6j5-qQxqV^%<^QJ2NDoGXMeso0R`Y+b6{Zq+!_qgcV zIPn+o{Ob979P>T}czO<=3f;S*@^{QXoBgBrz)Osu=j!(tz^57)J;X`r@INilIK_3Q-^?=g15EY`$$;xbNtAA4$!lQ z@!5VZ!^Az{i(@G-x^d44MM*8@-SA zcIB%+#=G<3pBV3s6MCkf#_P_L&oKUw%m2?YeiO&ZgFLPm82=CK9~~!jZ=UqK^JFjM z-Shi1#(%%SCpG(7{|Mu&c)apn7ie=450szX`E3XCLjjHlj=I+AV(^na53-$J)?X*% ziW?5_Jo9g0{%4qWLiTGPZ)lNo%>O#uIhP4PWV}0%4FI2N-U|>{WT$&yD3bAZyP(&A zsD$Ky+%{u=_x=@P{5a+>W&Xcm{7P4z`6lDP&3Jio2AQ7#Pj(jO`dEVk!_4o_{~qKw zvfmw_Wisz2sgR9lCH{6nZ#hxHwkS(I>v8Ap{fu|}`6b4?_nq%Z{jAght}uTY=gA7D z{RViCgU_nsfR)DOj(-p1U*~+XmIanG-km=;Fy6hsYJpD`4?CIPy>EVl`I}wyB5EVi zmS}5Towrs5Ly^$da7#24F{2G8-r;NvwcxGIV3WDEzNscqZw8}Hkrp%1+9qn78k*}v z(NNI)*y6&+QZt!#;l{8Th(rRr%ur)AvP;xO0u3QE*xJyr3nGq|3C?H|YbD-%4M)RG zjp4?v{L}~E)8#O zY>I?bqu4aVP1GNnONvZ-A2k}Tl@^5YR_$)I@ak;Gh7)x+AA zE7$w0%&L;o3O~L6yJi($@?C6}?g-V=!03hE^7Vo88o%Fx4Z}%AMdoTa8!rI|gAx6* zZ&9&X3isk&+=%oGiQ|>u%7*QvT4{FB-K3A~6awp3!JG!aHg$>YFcPXmL^aliYIoEq zTbAJ!=EhdMG2B+PxG}U-y{b$$L3YI=;t2%<(EzDQ&sxK*FrE73mzpsQOePIG)Y8&a z2SR0)xo&--bRUSzq)9SbEKi_I+X<hnRj1`OYHdh^)GO)iAAZLi%vew^ z?!{O(6oF>2rODie^P?4O7wt6;^{>u8#3Y0Sea@MYL??dZ{w?f^nJh zXbtjkqh7u)B4(yx-KrXBP_r|7BuiStyF*HPv>`#Jj2b`uOp%GCOIgtJseANt&CnML z?AQKBId`nJ%gdEMXfP^hN87Y4$tH70puRQ4-ly~jU+mzo4Y-ZKXB1NCcSTuq1ua9J zNYnxwBl9%)iwc7wE9xp6WZbPvKimd?aYLZZ)bn9htzN-9h5UGe+hz*3lp!S-Y^snN ztLxBq~#peEDWBGyM6!g4jH$i(Z%&Va?{db4q7QzS^)MJ9Ij=?Y%@>5NW_ z5VD)1gg=3iYQAkpI6Oe}H^97#QS#?yZIBh@NN6NGZJ zRA2F^ZE2Osg1nJ_QsiDEdmf9&p672%HpFxmE!X`zAf@m zW=m(P02#}46~#$kNP$23vnKq(6LJ+D3ah|zEjVi#sJ(%fkz)!+FFdu-)xh=j~pQ6&LNsp>N@Da=cq7#vtiEtZ2)e9I;| zbcMHNS3@*V0~n1cxJ}pa0h&;xS$G?pq9HH7FS9Tj*ebl+0xjEwH@K@2f(k|>!iytR z?+$!q2Z#Mh4HL9Ts6IdfTx+h63a>nu@S^J7inm3)_!18O%L(^Jn&jcEH?)n_hg7Ccgp>|lN1uU)a4kfdqSP16r2}Oz@__K-Y^(uiSk}4c z|DrMjvbZE-omcDoe)_v(`o6R7f_R;Uu6N-Qk|kf2=|ju@u9ru)?0>j?npBF=!X-va zUf)Ng_Zg{FDS&$ys^D^uEV@nCxp7Kv$@gaZ&=R=sD1q3+22&qY)$-F6Sv(1dOVXCS zzAs7FovG;i<*H@hf6?uyP@#L(`nrU^f7wPIqIl$g`+mLV(~!nT_r7&o-{(BS^35zy zO|nzl&uCGL3Yn_q_5ILXT*suM$Ep3LYhZd5y(e`RN4c`V2#jcju_r>-147Fo;Ia=|r z?b-`j8j~ge$QHEle%vDxSp2GG+oSux27*)Gy3byyBo1gnhFVU;_g(V(diMdA*LG?( zT28lbph7rpzrO!)gyr4*?)m!zWC_RYsHvjm^?hlL*YZxNV^8xCyuu2HZ&z5_hZfA&{u^Y&_cH(h From e275401eb806bc3b02304f74569c7984f02deac0 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 11 Jun 2024 14:45:27 -0600 Subject: [PATCH 270/326] Fix logic around merge path with TPLs (#2240) SPMV_MERGE_PATH is not always a native algorithm. Add SPMV_NATIVE_MERGE_PATH to cover that case specifically. Test this new option. --- sparse/impl/KokkosSparse_spmv_impl.hpp | 8 ++++---- sparse/src/KokkosSparse_spmv_handle.hpp | 12 +++++++++--- sparse/unit_test/Test_Sparse_spmv.hpp | 6 +++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index a2bb19a44c..f1f4c5700e 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -32,8 +32,6 @@ namespace KokkosSparse { namespace Impl { -constexpr const char* KOKKOSSPARSE_ALG_NATIVE_MERGE = "native-merge"; - // This TransposeFunctor is functional, but not necessarily performant. template @@ -609,7 +607,8 @@ static void spmv_beta(const execution_space& exec, Handle* handle, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - if (handle->algo == SPMV_MERGE_PATH) { + if (handle->algo == SPMV_MERGE_PATH || + handle->algo == SPMV_NATIVE_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { @@ -617,7 +616,8 @@ static void spmv_beta(const execution_space& exec, Handle* handle, dobeta, false>(exec, handle, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { - if (handle->algo == SPMV_MERGE_PATH) { + if (handle->algo == SPMV_MERGE_PATH || + handle->algo == SPMV_NATIVE_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index 6d23d2bde1..38f6056615 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -36,8 +36,11 @@ enum SPMVAlgorithm { /// is only used once. SPMV_NATIVE, /// Use the best KokkosKernels implementation, even if a TPL /// implementation is available. - SPMV_MERGE_PATH, /// Use load-balancing merge path algorithm (for CrsMatrix - /// only) + SPMV_MERGE_PATH, /// Use algorithm optimized for matrices with + /// imbalanced/irregular sparsity patterns (merge path or + /// similar). May call a TPL. For CrsMatrix only. + SPMV_NATIVE_MERGE_PATH, /// Use the KokkosKernels implementation of merge + /// path. For CrsMatrix only. SPMV_BSR_V41, /// Use experimental version 4.1 algorithm (for BsrMatrix only) SPMV_BSR_V42, /// Use experimental version 4.2 algorithm (for BsrMatrix only) SPMV_BSR_TC /// Use experimental tensor core algorithm (for BsrMatrix only) @@ -59,6 +62,7 @@ inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { case SPMV_FAST_SETUP: return "SPMV_FAST_SETUP"; case SPMV_NATIVE: return "SPMV_NATIVE"; case SPMV_MERGE_PATH: return "SPMV_MERGE_PATH"; + case SPMV_NATIVE_MERGE_PATH: return "SPMV_NATIVE_MERGE_PATH"; case SPMV_BSR_V41: return "SPMV_BSR_V41"; case SPMV_BSR_V42: return "SPMV_BSR_V42"; case SPMV_BSR_TC: return "SPMV_BSR_TC"; @@ -73,10 +77,11 @@ inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { inline bool is_spmv_algorithm_native(SPMVAlgorithm a) { switch (a) { case SPMV_NATIVE: - case SPMV_MERGE_PATH: + case SPMV_NATIVE_MERGE_PATH: case SPMV_BSR_V41: case SPMV_BSR_V42: case SPMV_BSR_TC: return true; + // DEFAULT, FAST_SETUP and MERGE_PATH may call TPLs default: return false; } } @@ -351,6 +356,7 @@ struct SPMVHandle } else { switch (get_algorithm()) { case SPMV_MERGE_PATH: + case SPMV_NATIVE_MERGE_PATH: throw std::invalid_argument(std::string("SPMVHandle: algorithm ") + get_spmv_algorithm_name(get_algorithm()) + " cannot be used if A is a BsrMatrix"); diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 2057a8ba14..0921a1b45a 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -518,7 +518,11 @@ template (algo, numRows, nnz, bandwidth, row_size_variance, heavy); } From efbf210d923e407dd0a3048a24497512a0f0ee32 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 11 Jun 2024 23:02:16 -0600 Subject: [PATCH 271/326] spgemm unit test: change matrix value distribution (#2241) Change the distribution A, B values are sampled from so that values in C can't end up close to 0 (as the result of summing terms that are larger). The relative error metric in is_same_matrix is sensitive to this. Fixes #2232 --- common/src/KokkosKernels_SimpleUtils.hpp | 4 ++-- sparse/unit_test/Test_Sparse_spgemm.hpp | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index b447f13397..055c1d6d32 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -342,9 +342,9 @@ struct IsRelativelyIdenticalFunctor { if (val_diff > mag_type(eps)) { Kokkos::printf( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " - "(eps = %e)\n", + "(eps = %e, rel err = %e)\n", (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), - KAT::imag(view2(i)), eps); + KAT::imag(view2(i)), eps, val_diff); num_diffs++; } } diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index bd1e68c370..139e47dcef 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -69,7 +69,10 @@ void randomize_matrix_values(const Values &v) { ScalarType randStart, randEnd; KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd); Kokkos::Random_XorShift64_Pool pool(13718); - Kokkos::fill_random(v, pool, randStart, randEnd); + // Instead of sampling from [-50, 50] or [-50-50i, 50+50i], + // sample from [1, 50] or [1+i, 50+50i]. That way relative + // error between values can't become large if values happen to sum close to 0. + Kokkos::fill_random(v, pool, randEnd / 50.0, randEnd); } template @@ -254,6 +257,8 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, m, k, nnz, row_size_variance, bandwidth); crsMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix( k, n, nnz, row_size_variance, bandwidth); + randomize_matrix_values(A.values); + randomize_matrix_values(B.values); KokkosSparse::sort_crs_matrix(A); KokkosSparse::sort_crs_matrix(B); From f400cc98fbdd060224e9b8662a17a065e7958cbf Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 12 Jun 2024 15:18:25 -0600 Subject: [PATCH 272/326] kokkoskernels_tpls.cmake: remove duplicates arguments when creating argument for exported INTERFACE_INCLUDE_DIRECTORIES Attempt to workaround issue #2238 --- cmake/kokkoskernels_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 6af952ce94..49d1adcdcb 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -330,6 +330,9 @@ MACRO(kokkoskernels_export_imported_tpl NAME) GET_TARGET_PROPERTY(TPL_INCLUDES ${TPL_IMPORTED_NAME} INTERFACE_INCLUDE_DIRECTORIES) IF(TPL_INCLUDES) + # remove duplicates to prevent incorrect number of arguments to INTERFACE_INCLUDE_DIRECTORIES + # see issue #2238 + LIST(REMOVE_DUPLICATES TPL_INCLUDES) KOKKOSKERNELS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES ${TPL_INCLUDES}") ENDIF() From e220db3603057d5ce7c70f9fc69beaacb407b918 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 12 Jun 2024 16:50:50 -0600 Subject: [PATCH 273/326] Sparse - BsrMatrix: adding new wiki example for documentation (#2228) There is already an example for this but it uses a CrsMatrix as starting point to build a BsrMatrix which is not really helpful in general as the hope is that you can use the BsrMatrix without needing the CrsMatrix as it would double the storage needed... Addressing Kim's comments --- example/wiki/sparse/CMakeLists.txt | 5 + .../sparse/KokkosSparse_wiki_bsrmatrix_2.cpp | 247 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp diff --git a/example/wiki/sparse/CMakeLists.txt b/example/wiki/sparse/CMakeLists.txt index 16d6a3a89d..8d061c24f8 100644 --- a/example/wiki/sparse/CMakeLists.txt +++ b/example/wiki/sparse/CMakeLists.txt @@ -10,6 +10,11 @@ if (KOKKOSKERNELS_ENABLE_EXPERIMENTAL) ) endif() +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_bsrmatrix_2 + SOURCES KokkosSparse_wiki_bsrmatrix_2.cpp + ) + KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( wiki_crsmatrix SOURCES KokkosSparse_wiki_crsmatrix.cpp diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp new file mode 100644 index 0000000000..7ff56ff14a --- /dev/null +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp @@ -0,0 +1,247 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include + +#include "Kokkos_Core.hpp" + +#include "KokkosKernels_default_types.hpp" +#include "KokkosSparse_BsrMatrix.hpp" + +using Scalar = default_scalar; +using Ordinal = default_lno_t; +using Offset = default_size_type; +using Layout = default_layout; + +template +struct bsr_fill { + bsrmatrix_type bsr_mat; + + bsr_fill(bsrmatrix_type bsr_mat_) : bsr_mat(bsr_mat_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& rowIdx) const { + if (rowIdx == 0) { // Left boundary condition + auto block_tmp = bsr_mat.unmanaged_block(0); + block_tmp(0, 0) = 1.0; + block_tmp(0, 1) = 0.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = 1.0; + } else if (rowIdx == bsr_mat.numRows() - 1) { // Right boundary condition + auto block_tmp = + bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp(0, 0) = 1.0; + block_tmp(1, 1) = 1.0; + } else { + auto block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx)); + block_tmp(0, 0) = -1.0; + block_tmp(0, 1) = -1.0 / 2.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = -1.0; + + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp(0, 0) = 2.0; + block_tmp(0, 1) = 0.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = 2.0; + + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 2); + block_tmp(0, 0) = -1.0; + block_tmp(0, 1) = 1.0 / 2.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = -1.0; + } + } +}; + +template +struct diagonal_extractor { + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + using bsr_block_type = typename bsrmatrix_type::block_type; + + bsrmatrix_type bsr_mat; + row_map_type row_map; + entries_type entries; + diag_blocks_type diag_blocks; + + diagonal_extractor(bsrmatrix_type bsr_mat_, diag_blocks_type diag_blocks_) + : bsr_mat(bsr_mat_), + row_map(bsr_mat_.graph.row_map), + entries(bsr_mat_.graph.entries), + diag_blocks(diag_blocks_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& rowIdx) const { + for (Offset entryIdx = row_map(rowIdx); entryIdx < row_map(rowIdx + 1); + ++entryIdx) { + if (entries(entryIdx) == rowIdx) { + bsr_block_type bsr_diag_block = bsr_mat.unmanaged_block(entryIdx); + for (int i = 0; i < bsr_mat.blockDim(); ++i) { + for (int j = 0; j < bsr_mat.blockDim(); ++j) { + diag_blocks(rowIdx, i, j) = bsr_diag_block(i, j); + } + } + } + } + } +}; + +int main(int argc, char* argv[]) { + using device_type = typename Kokkos::Device< + Kokkos::DefaultExecutionSpace, + typename Kokkos::DefaultExecutionSpace::memory_space>; + using bsrmatrix_type = + typename KokkosSparse::Experimental::BsrMatrix; + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + + Kokkos::initialize(argc, argv); + { + // + // We will create a 1D discretization for the coupled thermo-elastic + // diffusion + // + // -\div(EA \grad_s(u) - \alpha(T-T0)I) = f_u + // -\kappa\Delta(T) = f_T + // + // The problem is discretized using finite differences as follows: + // \frac{d^2 u}{dx^2}\approx \frac{u_{i+1}-2u_i+u_{i-1}}{h_x^2} + // \frac{dT}{dx}\approx\frac{T_{i+1}-T_{i-1}}{2h_x} + // \frac{d^2T}{dx^2}\approx\frac{T_{i+1}-2T_i+T_{i-1}}{h_x^2} + // + // This leads to the combined stencil (assuming all unit coefficients): + // + // [-1 1/2] [2 0] [-1 -1/2] + // [ 0 -1] [0 2] [ 0 -1] + // + // First the graph for the mesh will be constructed. + // Second a BsrMatrix will be constructed from the graph + // Third the values of the BsrMatrix will be filled. + + constexpr Ordinal blockSize = 2; + constexpr Ordinal numRows = 10; + constexpr Offset numNNZ = 3 * numRows - 2; + bsrmatrix_type bsr_mat; + + { + typename row_map_type::non_const_type row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row pointers"), + numRows + 1); + typename entries_type::non_const_type entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "column indices"), + numNNZ); + typename row_map_type::HostMirror row_map_h = + Kokkos::create_mirror_view(row_map); + typename entries_type::HostMirror entries_h = + Kokkos::create_mirror_view(entries); + + // First Step: build the CrsGraph + { + // Build the row pointers and store numNNZ + + row_map_h(0) = 0; + for (Ordinal rowIdx = 0; rowIdx < numRows; ++rowIdx) { + if (rowIdx == 0) { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 2; + + entries_h(row_map_h(rowIdx)) = rowIdx; + entries_h(row_map_h(rowIdx) + 1) = rowIdx + 1; + } else if (rowIdx == numRows - 1) { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 2; + + entries_h(row_map_h(rowIdx)) = rowIdx - 1; + entries_h(row_map_h(rowIdx) + 1) = rowIdx; + } else { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 3; + + entries_h(row_map_h(rowIdx)) = rowIdx - 1; + entries_h(row_map_h(rowIdx) + 1) = rowIdx; + entries_h(row_map_h(rowIdx) + 2) = rowIdx + 1; + } + } + + if (row_map_h(numRows) != numNNZ) { + std::ostringstream error_msg; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" + << row_map_h(numRows) << ", numNNZ=" << numNNZ; + throw std::runtime_error(error_msg.str()); + } + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + } + + graph_type myGraph(entries, row_map); + + // Second Step: build the BsrMatrix from graph and block size + bsr_mat = bsrmatrix_type("block matrix", myGraph, blockSize); + + bsr_fill fillFunctor(bsr_mat); + Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), fillFunctor); + + std::cout << "BsrMatrix graph: " << std::endl; + for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) { + std::cout << " ["; + for (int colIdx = 0; colIdx < entries_h(row_map_h(rowIdx)); ++colIdx) { + std::cout << " "; + } + std::cout << "*"; + for (Offset entryIdx = row_map_h(rowIdx); + entryIdx < row_map_h(rowIdx + 1) - 1; ++entryIdx) { + for (int colIdx = entries_h(entryIdx) + 1; + colIdx < entries_h(entryIdx + 1); ++colIdx) { + std::cout << " "; + } + std::cout << "*"; + } + for (int colIdx = entries_h(row_map_h(rowIdx + 1) - 1) + 1; + colIdx < numRows; ++colIdx) { + std::cout << " "; + } + std::cout << "]" << std::endl; + } + } + + // Extract diagonal block and store them in a rank-3 view + using diag_blocks_type = + Kokkos::View; + diag_blocks_type diag_blocks("diagonal blocks", numRows, blockSize, + blockSize); + diagonal_extractor myFunc(bsr_mat, diag_blocks); + Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), myFunc); + + auto diag_blocks_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, diag_blocks); + + std::cout << "\nBsrMatrix diagonal blocks: " << std::endl; + for (int blockId = 0; blockId < diag_blocks_h.extent_int(0); ++blockId) { + std::cout << " [" << diag_blocks_h(blockId, 0, 0) << ", " + << diag_blocks_h(blockId, 0, 1) << "]" << std::endl; + std::cout << " [" << diag_blocks_h(blockId, 1, 0) << ", " + << diag_blocks_h(blockId, 1, 1) << "]\n" + << std::endl; + } + } + Kokkos::finalize(); + + return 0; +} From 3ad65b226a419023d8b45ffe9158fc3a650d3aec Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 13 Jun 2024 08:45:34 -0600 Subject: [PATCH 274/326] Sparse - CrsToBsr: fix type mismatch (#2242) --- sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp index 7f1ff2171e..f773bdc0d8 100644 --- a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -99,6 +99,7 @@ template Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { using bsr_value_type = typename Bsr::value_type; using bsr_ordinal_type = typename Bsr::ordinal_type; + using crs_size_type = typename Crs::non_const_size_type; // copy matrix data to host auto hRowMap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), @@ -119,7 +120,7 @@ Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { for (bsr_ordinal_type row = 0; row < bsr_ordinal_type(hRowMap.size()) - 1; ++row) { - for (size_t ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { + for (crs_size_type ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { bsr_ordinal_type col = hColInds(ci); bsr_value_type val = hVals(ci); From 119eb18cefd8165d743ade19642678ffb933e465 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 13 Jun 2024 09:45:20 -0600 Subject: [PATCH 275/326] Update rocsparse algo defaults (#2245) * Update default spmv algorithms for rocsparse - Use stream for common cases (default, fast setup) as it has nearly zero setup cost and performs well for somewhat balanced matrices - Use adaptive (which is rocsparse's default) only if SPMV_MERGE_PATH is the algorithm, as it has a very high setup cost * Re-enable rocsparse spmv for SPMV_FAST_SETUP --- sparse/src/KokkosSparse_spmv.hpp | 7 ------- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 10 +++++++++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index ddbef56504..5fa0be3619 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -247,13 +247,6 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], YVector_Internal y_i(y); bool useNative = is_spmv_algorithm_native(handle->get_algorithm()); - // Also use the native algorithm if SPMV_FAST_SETUP was selected and - // rocSPARSE is the possible TPL to use. Native is faster in this case. -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - if (handle->get_algorithm() == SPMV_FAST_SETUP && - std::is_same_v) - useNative = true; -#endif // Now call the proper implementation depending on isBSR and the rank of X/Y if constexpr (!isBSR) { diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index be2588483f..33eb052135 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -392,7 +392,15 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], &vecY, y.extent_int(0), y_data, rocsparse_compute_type())); - rocsparse_spmv_alg alg = rocsparse_spmv_alg_default; + // Default to using the "stream" algorithm which has almost no setup cost, + // and performs well for reasonably balanced matrices + rocsparse_spmv_alg alg = rocsparse_spmv_alg_csr_stream; + if (handle->get_algorithm() == SPMV_MERGE_PATH) { + // Only use the "adaptive" algorithm if the user has indicated that the + // matrix is very imbalanced, by asking for merge path. This algorithm + // has fairly expensive setup + alg = rocsparse_spmv_alg_csr_adaptive; + } KokkosSparse::Impl::RocSparse_CRS_SpMV_Data* subhandle; if (handle->tpl_rank1) { From 774eff42fb66b017be4b0ea598057ad378ff9351 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 13 Jun 2024 11:19:25 -0600 Subject: [PATCH 276/326] In deprecated spmv, fix Controls algorithm mapping (#2246) native -> SPMV_NATIVE native-merge -> SPMV_NATIVE_MERGE_PATH merge -> SPMV_MERGE_PATH tpl -> SPMV_FAST_SETUP --- sparse/src/KokkosSparse_spmv_deprecated.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv_deprecated.hpp b/sparse/src/KokkosSparse_spmv_deprecated.hpp index f29caaec0c..0faef2ef4e 100644 --- a/sparse/src/KokkosSparse_spmv_deprecated.hpp +++ b/sparse/src/KokkosSparse_spmv_deprecated.hpp @@ -191,20 +191,18 @@ spmv(const ExecutionSpace& space, // Default to fast setup, since this handle can't be reused SPMVAlgorithm algo = SPMV_FAST_SETUP; // Translate the Controls algorithm selection to the SPMVHandle algorithm. - // This maintains the old behavior, where any manually set name that isn't - // "tpl" gives native. - // - // This also uses the behavior set by #2021: "merge" was a hint to use - // cuSPARSE merge path, but that path is gone so just use the normal TPL. - // "merge-path" means to use the KK merge-path implementation. // // And also support the 3 different BSR algorithms by their old names. if (controls.isParameter("algorithm")) { std::string algoName = controls.getParameter("algorithm"); - if (algoName == "merge" || algoName == "tpl") + if (algoName == "tpl") algo = SPMV_FAST_SETUP; - else if (algoName == "native-merge") + else if (algoName == "native") + algo = SPMV_NATIVE; + else if (algoName == "merge") algo = SPMV_MERGE_PATH; + else if (algoName == "native-merge") + algo = SPMV_NATIVE_MERGE_PATH; else if (algoName == "v4.1") algo = SPMV_BSR_V41; else if (algoName == "v4.2") From b5e6fa915da060f373e93c4500001bf3c10705f0 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Mon, 17 Jun 2024 21:55:01 +0200 Subject: [PATCH 277/326] Add batched serial tbsv (#2202) * Add batched serial tbsv * remove incx argument and use strided views instead * Add a new line at the end of files * fix random number generation for complex numbers * remove unused variables from internal tbsv serial functions * remove allclose for testing --------- Co-authored-by: Yuuichi Asahi --- .../impl/KokkosBatched_Tbsv_Serial_Impl.hpp | 169 +++++++++ .../KokkosBatched_Tbsv_Serial_Internal.hpp | 224 +++++++++++ batched/dense/src/KokkosBatched_Tbsv.hpp | 56 +++ .../dense/unit_test/Test_Batched_Dense.hpp | 3 + .../unit_test/Test_Batched_DenseUtils.hpp | 61 ++- .../unit_test/Test_Batched_SerialTbsv.hpp | 349 ++++++++++++++++++ .../Test_Batched_SerialTbsv_Complex.hpp | 120 ++++++ .../Test_Batched_SerialTbsv_Real.hpp | 137 +++++++ blas/impl/KokkosBlas_util.hpp | 1 + 9 files changed, 1118 insertions(+), 2 deletions(-) create mode 100644 batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp create mode 100644 batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp create mode 100644 batched/dense/src/KokkosBatched_Tbsv.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialTbsv.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp new file mode 100644 index 0000000000..675e73f744 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp @@ -0,0 +1,169 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv_Serial_Internal.hpp" + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkTbsvInput( + [[maybe_unused]] const AViewType &A, [[maybe_unused]] const XViewType &x, + [[maybe_unused]] const int k) { + static_assert(Kokkos::is_view::value, + "KokkosBatched::tbsv: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::tbsv: XViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::tbsv: AViewType must have rank 2."); + static_assert(XViewType::rank == 1, + "KokkosBatched::tbsv: XViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (k < 0) { + Kokkos::printf( + "KokkosBatched::tbsv: input parameter k must not be less than 0: k = " + "%d\n", + k); + return 1; + } + + const int lda = A.extent(0), n = A.extent(1); + if (lda < (k + 1)) { + Kokkos::printf( + "KokkosBatched::tbsv: leading dimension of A must be smaller than k+1: " + "lda = %d, k = %d\n", + lda, k); + return 1; + } + + const int nx = x.extent(0); + if (nx != n) { + Kokkos::printf( + "KokkosBatched::tbsv: Dimensions of x and A do not match: X: %d, A: %d " + "x %d\n" + "x.extent(0) must be equal to A.extent(1)\n", + nx, lda, n); + return 1; + } +#endif + return 0; +} + +//// Lower non-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLower::invoke( + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Lower transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLowerTranspose::invoke( + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Lower conjugate-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLowerTranspose::invoke( + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper non-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpper::invoke( + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpperTranspose::invoke( + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper conjugate-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpperTranspose::invoke( + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp new file mode 100644 index 0000000000..d2f5df4649 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp @@ -0,0 +1,224 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower, Non-Transpose +/// + +template +struct SerialTbsvInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalLower::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = 0; j < an; ++j) { + if (x[j * xs0] != static_cast(0)) { + if (!use_unit_diag) x[j * xs0] = x[j * xs0] / A[0 + j * as1]; + + auto temp = x[j * xs0]; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = j + 1; i < Kokkos::min(an, j + k + 1); ++i) { + x[i * xs0] = x[i * xs0] - temp * A[(i - j) * as0 + j * as1]; + } + } + } + + return 0; +} + +/// +/// Lower, Transpose +/// + +template +struct SerialTbsvInternalLowerTranspose { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalLowerTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = an - 1; j >= 0; --j) { + auto temp = x[j * xs0]; + + if (do_conj) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::min(an - 1, j + k); i > j; --i) { + temp -= + Kokkos::ArithTraits::conj(A[(i - j) * as0 + j * as1]) * + x[i * xs0]; + } + if (!use_unit_diag) + temp = temp / Kokkos::ArithTraits::conj(A[0 + j * as1]); + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::min(an - 1, j + k); i > j; --i) { + temp -= A[(i - j) * as0 + j * as1] * x[i * xs0]; + } + if (!use_unit_diag) temp = temp / A[0 + j * as1]; + } + x[j * xs0] = temp; + } + + return 0; +} + +/// +/// Upper, Non-Transpose +/// + +template +struct SerialTbsvInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalUpper::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = an - 1; j >= 0; --j) { + if (x[j * xs0] != 0) { + if (!use_unit_diag) x[j * xs0] = x[j * xs0] / A[k * as0 + j * as1]; + + auto temp = x[j * xs0]; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = j - 1; i >= Kokkos::max(0, j - k); --i) { + x[i * xs0] = x[i * xs0] - temp * A[(k - j + i) * as0 + j * as1]; + } + } + } + + return 0; +} + +/// +/// Upper, Transpose +/// + +template +struct SerialTbsvInternalUpperTranspose { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalUpperTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = 0; j < an; j++) { + auto temp = x[j * xs0]; + if (do_conj) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::max(0, j - k); i < j; ++i) { + temp -= Kokkos::ArithTraits::conj( + A[(i + k - j) * as0 + j * as1]) * + x[i * xs0]; + } + if (!use_unit_diag) + temp = + temp / Kokkos::ArithTraits::conj(A[k * as0 + j * as1]); + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::max(0, j - k); i < j; ++i) { + temp -= A[(i + k - j) * as0 + j * as1] * x[i * xs0]; + } + if (!use_unit_diag) temp = temp / A[k * as0 + j * as1]; + } + x[j * xs0] = temp; + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Tbsv.hpp b/batched/dense/src/KokkosBatched_Tbsv.hpp new file mode 100644 index 0000000000..7510c07969 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Tbsv.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_TBSV_HPP_ +#define KOKKOSBATCHED_TBSV_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Tbsv: +/// +/// Solve Ab_l x_l = b_l for all l = 0, ..., N +/// using the triangular solve algorithm Tbsv. Ab is an n by n unit, or +/// non-unit, upper or lower triangular band matrix, with ( k + 1 ) +/// diagonals. +/// +/// \tparam AViewType: Input type for the matrix, needs to be a 2D view +/// \tparam XViewType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param A [in]: A is a lda by n banded matrix, with ( k + 1 ) diagonals +/// \param X [inout]: right-hand side and the solution, a rank 1 view +/// \param k [in]: k specifies the number of superdiagonals or subdiagonals of +/// matrix A. k >= 0 +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &X, const int k); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Tbsv_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_TBSV_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index cf9b3c23f4..7b0ee58312 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -42,6 +42,9 @@ #include "Test_Batched_SerialTrsv.hpp" #include "Test_Batched_SerialTrsv_Real.hpp" #include "Test_Batched_SerialTrsv_Complex.hpp" +#include "Test_Batched_SerialTbsv.hpp" +#include "Test_Batched_SerialTbsv_Real.hpp" +#include "Test_Batched_SerialTbsv_Complex.hpp" #include "Test_Batched_SerialTrtri.hpp" #include "Test_Batched_SerialTrtri_Real.hpp" #include "Test_Batched_SerialTrtri_Complex.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index 6a96bd193a..689ff4f7a5 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -16,10 +16,12 @@ #ifndef TEST_BATCHED_DENSE_HELPER_HPP #define TEST_BATCHED_DENSE_HELPER_HPP +#include "KokkosBatched_Util.hpp" + namespace KokkosBatched { template -void create_tridiagonal_batched_matrices(const MatrixViewType &A, - const VectorViewType &B) { +void create_tridiagonal_batched_matrices(const MatrixViewType& A, + const VectorViewType& B) { Kokkos::Random_XorShift64_Pool< typename VectorViewType::device_type::execution_space> random(13718); @@ -54,6 +56,61 @@ void create_tridiagonal_batched_matrices(const MatrixViewType &A, Kokkos::fence(); } + +template +void create_banded_triangular_matrix(InViewType& in, OutViewType& out, + int k = 1, bool band_storage = true) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + + Kokkos::deep_copy(h_in, in); + if (band_storage) { + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == static_cast(k + 1)); + assert(out.extent(2) == in.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, k - i1, i2) = h_in(i0, i2 - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i2 + i1, i2); + } + } + } + } + } else { + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < Kokkos::min(i1 + k + 1, BlkSize); i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = Kokkos::max(0, i1 - k); i2 <= i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + } + } + } + } + } + Kokkos::deep_copy(out, h_out); +} } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp new file mode 100644 index 0000000000..572e02053b --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp @@ -0,0 +1,349 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Tbsv { + +template +struct ParamTag { + using uplo = U; + using trans = T; + using diag = D; +}; + +template +struct Functor_BatchedSerialTrsv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + + ScalarType _alpha; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialTrsv(const ScalarType alpha, const AViewType &a, + const BViewType &b) + : _a(a), _b(b), _alpha(alpha) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + KokkosBatched::SerialTrsv< + typename ParamTagType::uplo, typename ParamTagType::trans, + typename ParamTagType::diag, AlgoTagType>::invoke(_alpha, aa, bb); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialTbsv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialTbsv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + int _k; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialTbsv(const AViewType &a, const BViewType &b, const int k) + : _a(a), _b(b), _k(k) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + KokkosBatched::SerialTbsv< + typename ParamTagType::uplo, typename ParamTagType::trans, + typename ParamTagType::diag, AlgoTagType>::invoke(aa, bb, _k); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialTbsv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +/// \brief Implementation details of batched tbsv test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_tbsv(const int N, const int k, const int BlkSize) { + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + // Reference is created by trsv from triangular matrix + View3DType A("A", N, BlkSize, BlkSize), Ref("Ref", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x1("x1", N, BlkSize); // Solutions + + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(Ref, rand_pool, randStart, randEnd); + Kokkos::fill_random(x0, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(x1, x0); + + // Create triangluar or banded matrix + create_banded_triangular_matrix(Ref, A, k, + false); + create_banded_triangular_matrix(Ref, Ab, k, + true); + + // Reference trsv + Functor_BatchedSerialTrsv(1.0, A, x0) + .run(); + + // tbsv + Functor_BatchedSerialTbsv(Ab, x1, k) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using mag_type = typename ats::mag_type; + mag_type eps = 1.0e3 * ats::epsilon(); + + // Check x0 = x1 + auto h_x0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x0); + auto h_x1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x1); + for (int i = 0; i < N; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_x0(i, j), h_x1(i, j), eps); + } + } +} + +template +/// \brief Implementation details of batched tbsv test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +void impl_test_batched_tbsv_analytical(const std::size_t N) { + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using StridedView2DType = + Kokkos::View; + using View3DType = Kokkos::View; + + // Reference is created by trsv from triangular matrix + constexpr std::size_t BlkSize = 3, k = 2, incx = 2; + + View3DType A("A", N, BlkSize, BlkSize), ref("Ref", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x_ref("x_ref", N, BlkSize); // Solutions + + // Testing incx argument with strided Views + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx}; + StridedView2DType x1("x1", layout); // Solutions + + Kokkos::RangePolicy policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::Test::SerialTbsv::Initialize", policy, + KOKKOS_LAMBDA(const std::size_t ib) { + for (std::size_t i = 0; i < BlkSize; i++) { + for (std::size_t j = 0; j < BlkSize; j++) { + ref(ib, i, j) = i + 1; + } + } + for (std::size_t j = 0; j < BlkSize; j++) { + x0(ib, j) = 1; + x1(ib, j) = 1; + } + + if (std::is_same_v) { + if (std::is_same_v) { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0 / 2.0; + x_ref(ib, 1) = 1.0 / 6.0; + x_ref(ib, 2) = 1.0 / 3.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0; + x_ref(ib, 2) = 1.0; + } + } else { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 0.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 0.0; + } + } + } else { + if (std::is_same_v) { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0 / 2.0; + x_ref(ib, 2) = -1.0 / 6.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0; + x_ref(ib, 2) = 1.0; + } + } else { + if (std::is_same_v) { + x_ref(ib, 0) = 0.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 1.0 / 3.0; + } else { + x_ref(ib, 0) = 2.0; + x_ref(ib, 1) = -2.0; + x_ref(ib, 2) = 1.0; + } + } + } + }); + + Kokkos::fence(); + + // Create triangluar or banded matrix + create_banded_triangular_matrix(ref, A, k, + false); + create_banded_triangular_matrix(ref, Ab, k, + true); + + // tbsv + Functor_BatchedSerialTbsv(Ab, x0, k) + .run(); + + // tbsv with incx == 2 + Functor_BatchedSerialTbsv(Ab, x1, k) + .run(); + + Kokkos::fence(); + + // Check x0 = x_ref and x1 = x_ref + // Firstly, prepare contiguous views on host + auto h_x0 = Kokkos::create_mirror_view(x0); + auto h_x1 = Kokkos::create_mirror_view(x0); + + Kokkos::deep_copy(h_x0, x0); + + // Pack x1 into x0 for contiguous storage + Kokkos::parallel_for( + "KokkosBatched::Test::SerialTbsv::Copy", policy, + KOKKOS_LAMBDA(const std::size_t ib) { + for (std::size_t j = 0; j < BlkSize; j++) { + x0(ib, j) = x1(ib, j); + } + }); + + Kokkos::fence(); + Kokkos::deep_copy(h_x1, x0); + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using mag_type = typename ats::mag_type; + mag_type eps = 1.0e3 * ats::epsilon(); + + auto h_x_ref = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t j = 0; j < BlkSize; j++) { + // Check x0 = x_ref + EXPECT_NEAR_KK(h_x0(ib, j), h_x_ref(ib, j), eps); + + // Check x1 = x_ref + EXPECT_NEAR_KK(h_x1(ib, j), h_x_ref(ib, j), eps); + } + } +} + +} // namespace Tbsv +} // namespace Test + +template +int test_batched_tbsv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + for (int i = 0; i < 10; i++) { + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + for (int i = 0; i < 10; i++) { + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp new file mode 100644 index 0000000000..8789cc6931 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp @@ -0,0 +1,120 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} + +/* [FIXME] These tests need Trans::ConjTranspose in trsv. +// CONJUGATE TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_ct_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_ct_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_ct_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_ct_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +*/ +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp new file mode 100644 index 0000000000..8915b4ad05 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp @@ -0,0 +1,137 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 50173538fb..ecb72e7c9a 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -116,6 +116,7 @@ struct Algo { using Gemv = Level2; using Trsv = Level2; using ApplyQ = Level2; + using Tbsv = Level2; }; namespace Impl { From 49b1d46fcb08e71e8f652cef8b488ac60d6c99d5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 21:56:18 +0200 Subject: [PATCH 278/326] Bump actions/checkout from 4.1.6 to 4.1.7 (#2248) Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.6 to 4.1.7. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/a5ac7e51b41094c92402da3b24376905380afc29...692973e3d937129bcbf40652eb9f2f61becf3332) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/bdw.yml | 4 ++-- .github/workflows/codeql.yml | 4 ++-- .github/workflows/dependency-review.yml | 2 +- .github/workflows/docs.yml | 4 ++-- .github/workflows/format.yml | 2 +- .github/workflows/h100.yml | 4 ++-- .github/workflows/mi210.yml | 4 ++-- .github/workflows/osx.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- .github/workflows/spr.yml | 4 ++-- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index f60008ab72..450a0975ab 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -188,12 +188,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 06328c83c1..073453d075 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -38,7 +38,7 @@ jobs: egress-policy: audit - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels @@ -52,7 +52,7 @@ jobs: # Prefix the list here with "+" to use these queries and those in the config file. - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: 'kokkos/kokkos' path: 'kokkos' diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index b911317970..1792f0181c 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -22,6 +22,6 @@ jobs: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: 'Dependency Review' uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 901e218fdc..9690446a4f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,12 +25,12 @@ jobs: doxygen --version - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 5517b68dbb..08b541587f 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Install Dependencies run: sudo apt install clang-format-8 diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index 5fd01d972b..0d20177b96 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index 7b55f065bf..9735e405f1 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -107,12 +107,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 082467d614..fa23b5dd72 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 2885ca7fae..dee549daeb 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -38,7 +38,7 @@ jobs: egress-policy: audit - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: persist-credentials: false diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml index 8fe8053f5b..c38d04ac8d 100644 --- a/.github/workflows/spr.yml +++ b/.github/workflows/spr.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} From fe2a92f9f20b37589c89180f9aa6ac18ee020926 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 21:57:00 +0200 Subject: [PATCH 279/326] Bump github/codeql-action from 3.25.8 to 3.25.10 (#2249) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.8 to 3.25.10. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/2e230e8fe0ad3a14a340ad0815ddb96d599d2aff...23acc5c183826b7a8a97bce3cecc52db901f8251) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 073453d075..7ed1a206a3 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/init@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/analyze@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index dee549daeb..32dcabc873 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/upload-sarif@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 with: sarif_file: results.sarif From 819c40b84aa2ce7b6075eec50f6bb140dd2909a4 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 17 Jun 2024 16:02:15 -0600 Subject: [PATCH 280/326] FindTPLROC*: updates to fix export of import targets Changes for the Rocm tpls to match the handling as done with the Cuda tpls Should resolve issue #2238 --- cmake/Modules/FindTPLROCBLAS.cmake | 58 ++++++++++++++++++++++------ cmake/Modules/FindTPLROCSOLVER.cmake | 55 ++++++++++++++++++++++---- cmake/Modules/FindTPLROCSPARSE.cmake | 54 ++++++++++++++++++++++---- 3 files changed, 139 insertions(+), 28 deletions(-) diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index c0a9de3b50..4edcd82944 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,13 +1,47 @@ -# MPL: 12/29/2022: CMake regular way to find a package -FIND_PACKAGE(ROCBLAS) -if(TARGET roc::rocblas) -## MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention -## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) - SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) - SET(TPL_IMPORTED_NAME roc::rocblas) -## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention -## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) - ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") +IF(ROCBLAS_LIBRARIES AND ROCBLAS_LIBRARY_DIRS AND ROCBLAS_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER_PATHS ${ROCBLAS_INCLUDE_DIRS} + ) +ELSEIF(ROCBLAS_LIBRARIES AND ROCBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_LIBRARIES) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES rocblas + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_ROOT OR KokkosKernels_ROCBLAS_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES rocblas + HEADER rocblas.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCBLAS) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCBLAS_FOUND) + #Important note here: this find Module is named TPLROCBLAS + #The eventual target is named ROCBLAS. To avoid naming conflicts + #the find module is called TPLROCBLAS. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE + LINK_LIBRARIES "${ROCBLAS_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/Modules/FindTPLROCSOLVER.cmake b/cmake/Modules/FindTPLROCSOLVER.cmake index 8f2a92cfda..58eae9f8f5 100644 --- a/cmake/Modules/FindTPLROCSOLVER.cmake +++ b/cmake/Modules/FindTPLROCSOLVER.cmake @@ -1,9 +1,48 @@ -# LBV: 11/08/2023: This file follows the partern of FindTPLROCBLAS.cmake/FindTPLROCSPARSE.cmake -FIND_PACKAGE(ROCSOLVER) -if(TARGET roc::rocsolver) - SET(TPL_ROCSOLVER_IMPORTED_NAME roc::rocsolver) - SET(TPL_IMPORTED_NAME roc::rocsolver) - ADD_LIBRARY(KokkosKernels::ROCSOLVER ALIAS roc::rocsolver) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCSOLVER requested but not found") +IF(ROCSOLVER_LIBRARIES AND ROCSOLVER_LIBRARY_DIRS AND ROCSOLVER_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER_PATHS ${ROCSOLVER_INCLUDE_DIRS} + ) +ELSEIF(ROCSOLVER_LIBRARIES AND ROCSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_LIBRARIES) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES rocsolver + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_ROOT OR KokkosKernels_ROCSOLVER_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES rocsolver + HEADER rocsolver.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCSOLVER) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCSOLVER_FOUND) + #Important note here: this find Module is named TPLROCSOLVER + #The eventual target is named ROCSOLVER. To avoid naming conflicts + #the find module is called TPLROCSOLVER. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSOLVER REQUIRED_VARS ROCSOLVER_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCSOLVER_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSOLVER REQUIRED_VARS ROCSOLVER_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSOLVER INTERFACE + LINK_LIBRARIES "${ROCSOLVER_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() + diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index 5f985ff3a8..3b45ba5e82 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,9 +1,47 @@ -# MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake -FIND_PACKAGE(ROCSPARSE) -if(TARGET roc::rocsparse) - SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) - SET(TPL_IMPORTED_NAME roc::rocsparse) - ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") +IF(ROCSPARSE_LIBRARIES AND ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER_PATHS ${ROCSPARSE_INCLUDE_DIRS} + ) +ELSEIF(ROCSPARSE_LIBRARIES AND ROCSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_LIBRARIES) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES rocsparse + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_ROOT OR KokkosKernels_ROCSPARSE_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES rocsparse + HEADER rocsparse.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCSPARSE) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCSPARSE_FOUND) + #Important note here: this find Module is named TPLROCSPARSE + #The eventual target is named ROCSPARSE. To avoid naming conflicts + #the find module is called TPLROCSPARSE. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE + LINK_LIBRARIES "${ROCSPARSE_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() From 47942bf7c87c470275c5bb221e4a52eefa989e40 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 24 Jun 2024 08:58:55 -0600 Subject: [PATCH 281/326] Fix warning about memcpy (#2252) When building Stokhos BlockCrs, this util function gave a warning about memcpy modifying a non-trivially-copyable type. Silence it by casting to void* --- common/src/KokkosKernels_BlockUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/KokkosKernels_BlockUtils.hpp b/common/src/KokkosKernels_BlockUtils.hpp index 006a38a6e4..6fd9d9b656 100644 --- a/common/src/KokkosKernels_BlockUtils.hpp +++ b/common/src/KokkosKernels_BlockUtils.hpp @@ -39,7 +39,7 @@ template KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, value_type *dst, const value_type *val) { - memcpy(dst, val, block_dim * block_dim * sizeof(value_type)); + memcpy((void *)dst, val, block_dim * block_dim * sizeof(value_type)); } // Performs A += B on blocks From e9f7913d9ca017c297f15ebd9156f985efb52f13 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 24 Jun 2024 22:23:30 -0600 Subject: [PATCH 282/326] RCM fixes, improvements (#2254) * Fix RCM starting vertex issue, improve testing * apply reversing as labels are computed instead of at the end. Saves a loop over all the labels * use min-degree starting vertex within each connected component --- graph/impl/KokkosGraph_BFS_impl.hpp | 57 +++++------ graph/unit_test/Test_Graph_rcm.hpp | 145 ++++++++++++++++++++++------ 2 files changed, 148 insertions(+), 54 deletions(-) diff --git a/graph/impl/KokkosGraph_BFS_impl.hpp b/graph/impl/KokkosGraph_BFS_impl.hpp index e73c1cb489..9ea5d63e07 100644 --- a/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/graph/impl/KokkosGraph_BFS_impl.hpp @@ -38,7 +38,7 @@ struct SerialRCM { host_lno_view_t entries; SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) - : numVerts(rowmap_.extent(0) - 1), + : numVerts(std::max(rowmap_.extent_int(0), 1) - 1), rowmap(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostRowmap"), rowmap_.extent(0)), entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostEntries"), @@ -47,35 +47,39 @@ struct SerialRCM { Kokkos::deep_copy(entries, entries_); } - lno_t findPseudoPeripheral() { - // Choose vertex with smallest degree - lno_t periph = -1; - lno_t periphDeg = numVerts; - for (lno_t i = 0; i < numVerts; i++) { - lno_t deg = rowmap(i + 1) - rowmap(i); - if (deg < periphDeg) { - periph = i; - periphDeg = deg; - if (deg == 0) break; - } - } - return periph; - } - lno_view_t rcm() { - lno_t start = findPseudoPeripheral(); + // Given a label L, labelReverse - L gives the reversed label (as in reverse + // Cuthill McKee) + lno_t labelReverse = numVerts - 1; host_lno_view_t q(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Queue"), numVerts); host_lno_view_t label( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Permutation"), numVerts); for (lno_t i = 0; i < numVerts; i++) label(i) = -1; - lno_t qhead = 0; - lno_t qtail = 0; - label(start) = qtail; + lno_t qhead = 0; + lno_t qtail = 0; + // List of all vertices, in order from lowest to highest degree + // (heuristic for best to worst starting vertex for RCM). + // If the graph has multiple connected components, restart at the first + // unlabeled vertex in this list. + host_lno_view_t allVertices( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "allVertices"), + numVerts); + for (lno_t i = 0; i < numVerts; i++) allVertices(i) = i; + std::sort(allVertices.data(), allVertices.data() + numVerts, + [&](lno_t n1, lno_t n2) -> bool { + // return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < + (rowmap(n2 + 1) - rowmap(n2)); + }); + lno_t allVerticesIter = 0; + // Start RCM with the first vertex in allVertices + lno_t start = allVertices(allVerticesIter++); + label(start) = labelReverse - qtail; q(qtail++) = start; + // Reuse this neighbor list for all levels without deallocating std::vector neighbors; - lno_t outerQueue = 0; while (true) { lno_t v = q(qhead++); neighbors.clear(); @@ -94,7 +98,7 @@ struct SerialRCM { }); // label and enqueue all unlabeled neighbors for (lno_t nei : neighbors) { - label(nei) = qtail; + label(nei) = labelReverse - qtail; q(qtail++) = nei; } if (qtail == numVerts) { @@ -102,16 +106,15 @@ struct SerialRCM { break; } else if (qhead == qtail) { // have exhausted this connected component, but others remain unlabeled - while (label(outerQueue) != -1) outerQueue++; - label(outerQueue) = qtail; - q(qtail++) = outerQueue; + while (label(allVertices(allVerticesIter)) != -1) allVerticesIter++; + lno_t restart = allVertices(allVerticesIter); + label(restart) = labelReverse - qtail; + q(qtail++) = restart; } } lno_view_t labelOut( Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCM Permutation"), numVerts); - // reverse the labels - for (lno_t i = 0; i < numVerts; i++) label(i) = numVerts - label(i) - 1; Kokkos::deep_copy(labelOut, label); return labelOut; } diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index 2e05554d2d..a6d165d8c3 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -19,7 +19,7 @@ #include "KokkosGraph_RCM.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_CrsMatrix.hpp" +#include "Kokkos_StaticCrsGraph.hpp" #include @@ -81,7 +81,7 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - lno_t numVerts = rowmap.extent(0) - 1; + lno_t numVerts = std::max(1, rowmap.extent_int(0)) - 1; int bw = 0; for (lno_t i = 0; i < numVerts; i++) { lno_t origRow = perm(i); @@ -97,18 +97,10 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, return bw; } -template -void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type rowmap_t; - typedef typename graph_t::entries_type entries_t; - lno_t numVerts = gridX * gridY * gridZ; - typename rowmap_t::non_const_type rowmap; - typename entries_t::non_const_type entries; - generate7pt(rowmap, entries, gridX, gridY, gridZ); +template +void test_rcm(const rowmap_t& rowmap, const entries_t& entries, + bool expectBandwidthReduced) { + using lno_t = typename entries_t::non_const_value_type; auto rcm = KokkosGraph::Experimental::graph_rcm( rowmap, entries); auto rowmapHost = @@ -116,6 +108,7 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + lno_t numVerts = std::max(rowmap.extent_int(0), 1) - 1; decltype(rcmHost) rcmPermHost( Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCMPerm"), numVerts); for (lno_t i = 0; i < numVerts; i++) rcmPermHost(rcmHost(i)) = i; @@ -130,21 +123,119 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { } for (lno_t i = 0; i < numVerts; i++) ASSERT_EQ(counts[i], 1); } - Kokkos::View identityOrder( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); - for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; - size_t origBW = - maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); - size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); - EXPECT_LE(rcmBW, origBW); + if (expectBandwidthReduced) { + Kokkos::View identityOrder( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); + for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; + size_t origBW = + maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + EXPECT_LE(rcmBW, origBW); + } +} + +template +void test_rcm_zerorows() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap; + entries_t entries; + test_rcm(rowmap, entries, false); +} + +template +void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, + bool expectBandwidthReduced) { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap; + entries_t entries; + generate7pt(rowmap, entries, gridX, gridY, gridZ); + test_rcm(rowmap, entries, expectBandwidthReduced); +} + +template +void test_rcm_4clique() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap("rowmap", 5); + entries_t entries("entries", 16); + auto rowmap_host = Kokkos::create_mirror_view(rowmap); + auto entries_host = Kokkos::create_mirror_view(entries); + for (lno_t i = 0; i < 5; i++) rowmap_host(i) = i * 4; + for (lno_t i = 0; i < 16; i++) entries_host(i) = i % 4; + Kokkos::deep_copy(rowmap, rowmap_host); + Kokkos::deep_copy(entries, entries_host); + test_rcm(rowmap, entries, false); +} + +template +void test_rcm_multiple_components() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + // Generate a single 3D grid first + rowmap_t rowmap_cube; + entries_t entries_cube; + generate7pt(rowmap_cube, entries_cube, 7, 7, 7); + auto rowmap_cube_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap_cube); + auto entries_cube_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries_cube); + lno_t nv_cube = 7 * 7 * 7; + lno_t ne_cube = entries_cube.extent(0); + // Now replicate the graph twice, so there are 2 disconnected copies of the + // cube + rowmap_t rowmap("rowmap", nv_cube * 2 + 1); + entries_t entries("entries", ne_cube * 2); + auto rowmap_host = Kokkos::create_mirror_view(rowmap); + auto entries_host = Kokkos::create_mirror_view(entries); + for (lno_t i = 0; i <= nv_cube * 2; i++) { + if (i < nv_cube) + rowmap_host(i) = rowmap_cube_host(i); + else + rowmap_host(i) = ne_cube + rowmap_cube_host(i - nv_cube); + } + for (lno_t i = 0; i < ne_cube * 2; i++) { + if (i < ne_cube) + entries_host(i) = entries_cube_host(i); + else + entries_host(i) = nv_cube + entries_cube_host(i - ne_cube); + } + Kokkos::deep_copy(rowmap, rowmap_host); + Kokkos::deep_copy(entries, entries_host); + test_rcm(rowmap, entries, true); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - graph##_##rcm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm(6, 3, 3); \ - test_rcm(20, 20, 20); \ - test_rcm(100, 100, 1); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + graph##_##rcm_zerorows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_zerorows(); \ + } \ + TEST_F(TestCategory, \ + graph##_##rcm_7pt##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_7pt(1, 1, 1, false); \ + test_rcm_7pt(2, 1, 1, false); \ + test_rcm_7pt(6, 3, 3, true); \ + test_rcm_7pt(20, 20, 20, true); \ + test_rcm_7pt(100, 100, 1, true); \ + } \ + TEST_F(TestCategory, \ + graph##_##rcm_4clique##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_4clique(); \ + } \ + TEST_F( \ + TestCategory, \ + graph##_##rcm_multiple_components##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_multiple_components(); \ } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ From 31be65899841af53928451311658b8c5a7500831 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 25 Jun 2024 08:09:07 -0600 Subject: [PATCH 283/326] spgemm: add profiling regions to native implementations (#2253) * spgemm: add profiling regions to native implementations * Add profiling region to KokkosSPGEMM::KokkosSPGEMM_symbolic --- sparse/impl/KokkosSparse_spgemm_impl_def.hpp | 2 ++ sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp | 2 ++ sparse/impl/KokkosSparse_spgemm_impl_seq.hpp | 5 +++++ sparse/impl/KokkosSparse_spgemm_impl_speed.hpp | 2 ++ 4 files changed, 11 insertions(+) diff --git a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index a420a81c90..54e4e228c8 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -59,6 +59,7 @@ void KokkosSPGEMM::KokkosSPGEMM_symbolic(c_row_view_t rowmapC_) { + Kokkos::Profiling::pushRegion("KokkosSparse::spgemm_symbolic[NATIVE]"); { if (KOKKOSKERNELS_VERBOSE) { std::cout << "SYMBOLIC PHASE" << std::endl; @@ -162,6 +163,7 @@ void KokkosSPGEMMget_spgemm_handle()->set_c_nnz(result_index); Kokkos::deep_copy(row_mapC, h_rmc); Kokkos::fence(); + Kokkos::Profiling::popRegion(); } template Date: Fri, 28 Jun 2024 09:35:40 -0600 Subject: [PATCH 284/326] sparse: replace macros with constexpr bools (#2260) --- .../impl/KokkosSparse_spadd_numeric_impl.hpp | 61 +++--- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 34 +-- sparse/src/KokkosSparse_par_ilut.hpp | 195 +++++++++--------- 3 files changed, 149 insertions(+), 141 deletions(-) diff --git a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp index fa356dc963..16c228d8ec 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -169,10 +169,11 @@ struct UnsortedNumericSumFunctor { const CcolindsT Bpos; }; -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool spadd_numeric_same_type = + std::is_same_v, + typename std::remove_const_t>; template < typename execution_space, typename KernelHandle, typename alno_row_view_t, @@ -193,46 +194,56 @@ void spadd_numeric_impl( typedef typename KernelHandle::nnz_scalar_t scalar_type; // Check that A/B/C data types match KernelHandle types, and that C data types // are nonconst (doesn't matter if A/B types are const) - static_assert(SAME_TYPE(ascalar_t, scalar_type), + static_assert(spadd_numeric_same_type, "A scalar type must match handle scalar type"); - static_assert(SAME_TYPE(bscalar_t, scalar_type), + static_assert(spadd_numeric_same_type, "B scalar type must match handle scalar type"); - static_assert(SAME_TYPE(typename alno_row_view_t::value_type, size_type), - "add_symbolic: A size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_row_view_t::value_type, size_type), - "add_symbolic: B size_type must match KernelHandle size_type " - "(const doesn't matter)"); static_assert( - SAME_TYPE(typename clno_row_view_t::non_const_value_type, size_type), + spadd_numeric_same_type, + "add_symbolic: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + spadd_numeric_same_type, + "add_symbolic: B size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + spadd_numeric_same_type, "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(SAME_TYPE(typename alno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: A entry type must match KernelHandle entry type " "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: B entry type must match KernelHandle entry type " "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename clno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: C entry type must match KernelHandle entry type " "(aka nnz_lno_t)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C entry type must not be const"); static_assert( - SAME_TYPE(typename ascalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: A scalar type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename bscalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: B scalar type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename cscalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: C scalar type must match KernelHandle entry type (aka " "nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C scalar type must not be const"); + static_assert( + std::is_same_v, + "add_symbolic: C scalar type must not be const"); typedef Kokkos::RangePolicy range_type; auto addHandle = kernel_handle->get_spadd_handle(); // rowmap length can be 0 or 1 if #rows is 0. @@ -269,8 +280,6 @@ void spadd_numeric_impl( addHandle->set_call_numeric(); } -#undef SAME_TYPE - } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 80506e3056..764d185f90 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -24,10 +24,11 @@ namespace KokkosSparse { namespace Impl { -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool spadd_symbolic_same_type = + std::is_same_v, + typename std::remove_const_t>; // get C rowmap for sorted input template , "add_symbolic: A size_type must match KernelHandle size_type (const " "doesn't matter)"); static_assert( - SAME_TYPE(typename blno_row_view_t_::non_const_value_type, size_type), + spadd_symbolic_same_type, "add_symbolic: B size_type must match KernelHandle size_type (const " "doesn't matter)"); static_assert( - SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type), + spadd_symbolic_same_type, "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C size_type must not be const"); static_assert( - SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type), + spadd_symbolic_same_type, "add_symbolic: A entry type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type), + spadd_symbolic_same_type, "add_symbolic: B entry type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C entry type must not be const"); // symbolic just needs to compute c_rowmap // easy for sorted, but for unsorted is easiest to just compute the whole sum @@ -594,8 +600,6 @@ void spadd_symbolic_impl( addHandle->set_call_numeric(false); } -#undef SAME_TYPE - } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 8ded6209ec..edaae8192f 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -44,9 +44,11 @@ namespace KokkosSparse { namespace Experimental { -#define KOKKOSKERNELS_PAR_ILUT_SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool parilut_same_type = + std::is_same_v, + typename std::remove_const_t>; /// @brief Performs the symbolic phase of par_ilut. /// This is a non-blocking function. @@ -78,24 +80,24 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename ARowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: A size_type must match KernelHandle " - "size_type (const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AEntriesType::non_const_value_type, ordinal_type), + static_assert( + parilut_same_type, + "par_ilut_symbolic: A size_type must match KernelHandle " + "size_type (const doesn't matter)"); + static_assert(parilut_same_type, "par_ilut_symbolic: A entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LRowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: L size_type must match KernelHandle " - "size_type (const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_symbolic: L size_type must match KernelHandle " + "size_type (const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename URowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: U size_type must match KernelHandle " - "size_type (const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_symbolic: U size_type must match KernelHandle " + "size_type (const doesn't matter)"); static_assert(Kokkos::is_view::value, "par_ilut_symbolic: A_rowmap is not a Kokkos::View."); @@ -118,25 +120,25 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_symbolic: A_rowmap, L_rowmap and U_rowmap must all " "have rank 1."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: The output L_rowmap must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: The output U_rowmap must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: Views LRowMapType and ARowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: Views LRowMapType and URowMapType have " "different device_types."); static_assert( - std::is_same< + std::is_same_v< typename LRowMapType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_symbolic: KernelHandle and Views have different execution " "spaces."); @@ -165,26 +167,26 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, typename ARowMapType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename ARowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AEntries_Internal = Kokkos::View< typename AEntriesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< AEntriesType>::array_layout, typename AEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LRowMap_Internal = Kokkos::View< typename LRowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LRowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using URowMap_Internal = Kokkos::View< typename URowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename URowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; ARowMap_Internal A_rowmap_i = A_rowmap; AEntries_Internal A_entries_i = A_entries; @@ -240,46 +242,43 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, using scalar_type = typename KernelHandle::nnz_scalar_t; static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename ARowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: A size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: A entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AValuesType::value_type, scalar_type), - "par_ilut_numeric: A scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: A scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LRowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: L size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: L entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LValuesType::value_type, scalar_type), - "par_ilut_numeric: L scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: L scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename URowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: U size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename UEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: U entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename UValuesType::value_type, scalar_type), - "par_ilut_numeric: U scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: U scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert(Kokkos::is_view::value, "par_ilut_numeric: A_rowmap is not a Kokkos::View."); @@ -330,73 +329,71 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_numeric: A_values, L_values and U_values must all " "have rank 1."); - static_assert( - std::is_same::value, - "par_ilut_numeric: The output L_entries must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, + "par_ilut_numeric: The output L_entries must be nonconst."); + static_assert(std::is_same_v, "par_ilut_numeric: The output L_values must be nonconst."); - static_assert( - std::is_same::value, - "par_ilut_numeric: The output U_entries must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, + "par_ilut_numeric: The output U_entries must be nonconst."); + static_assert(std::is_same_v, "par_ilut_numeric: The output U_values must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LRowMapType and ARowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LEntriesType and AEntriesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LValuesType and AValuesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LRowMapType and URowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LEntriesType and UEntriesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LValuesType and UValuesType have " "different device_types."); static_assert( - std::is_same< + std::is_same_v< typename LRowMapType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same< + std::is_same_v< typename LEntriesType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same< + std::is_same_v< typename LValuesType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same::value, + std::is_same_v, "par_ilut_numeric: rowmap and entries have different device types."); static_assert( - std::is_same::value, + std::is_same_v, "par_ilut_numeric: rowmap and values have different device types."); // Check if symbolic has been called @@ -431,58 +428,58 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, typename ARowMapType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename ARowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AEntries_Internal = Kokkos::View< typename AEntriesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< AEntriesType>::array_layout, typename AEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AValues_Internal = Kokkos::View< typename AValuesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename AValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LRowMap_Internal = Kokkos::View< typename LRowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LRowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LEntries_Internal = Kokkos::View::array_layout, typename LEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LValues_Internal = Kokkos::View< typename LValuesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using URowMap_Internal = Kokkos::View< typename URowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename URowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using UEntries_Internal = Kokkos::View::array_layout, typename UEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using UValues_Internal = Kokkos::View< typename UValuesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename UValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; ARowMap_Internal A_rowmap_i = A_rowmap; AEntries_Internal A_entries_i = A_entries; @@ -519,6 +516,4 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, } // namespace Experimental } // namespace KokkosSparse -#undef KOKKOSKERNELS_PAR_ILUT_SAME_TYPE - #endif // KOKKOSSPARSE_PAR_ILUT_HPP_ From 41954e20d6811b5c21918bb195c8b6dd833c7736 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 28 Jun 2024 12:44:58 -0600 Subject: [PATCH 285/326] Rename `Impl::alignPtr` to `Impl::alignPtrTo`, allow it to infer argument type (#2261) * KokkosKernels::Impl::alignPtr infers argument type * Rename KokkosKernels::Impl::alignPtr -> alignPtrTo --- common/src/KokkosKernels_Utils.hpp | 4 ++-- sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 12 +++++------- sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp | 2 +- sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp | 15 ++++++--------- sparse/impl/KokkosSparse_spgemm_impl_speed.hpp | 2 +- .../KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp | 12 +++++------- 6 files changed, 20 insertions(+), 27 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index ba8049cecf..45aa8132bc 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1527,8 +1527,8 @@ struct array_sum_reduce { } }; -template -KOKKOS_INLINE_FUNCTION T *alignPtr(InPtr p) { +template +KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { // ugly but computationally free and the "right" way to do this in C++ std::uintptr_t ptrVal = reinterpret_cast(p); // ptrVal + (align - 1) lands inside the next valid aligned scalar_t, diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index 6eb9044733..a36200b295 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -270,8 +270,7 @@ struct KokkosBSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); BlockAccumulator hm(block_dim, pow2_hash_size, pow2_hash_func, nullptr, nullptr, hash_ids, hash_values); @@ -414,7 +413,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); BlockAccumulator hm(block_dim, thread_shmem_key_size, thread_shared_memory_hash_func, begins, nexts, keys, @@ -554,7 +553,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); @@ -601,8 +600,7 @@ struct KokkosBSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); } // initialize begins. { @@ -885,7 +883,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp index bc1b378558..22111d3752 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -325,7 +325,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::BlockHashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, diff --git a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index a7fee71278..8fb2711cdf 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -261,8 +261,7 @@ struct KokkosSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -409,8 +408,7 @@ struct KokkosSPGEMM(tmp); + hm2.values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -498,7 +496,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::HashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, @@ -639,7 +637,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); @@ -686,8 +684,7 @@ struct KokkosSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); } // initialize begins. { @@ -970,7 +967,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 954bfb3f3e..e19f5b7bc5 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -304,7 +304,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::HashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index f638b76b9b..d48d297e2d 100644 --- a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -260,8 +260,7 @@ struct KokkosSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -452,7 +451,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); // Create the hashmaps KokkosKernels::Experimental::HashmapAccumulator< @@ -610,7 +609,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); int vector_rank = 0; @@ -826,7 +825,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); int vector_rank = 0; @@ -871,8 +870,7 @@ struct KokkosSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); nnz_lno_t num_threads = pow2_hash_size / vector_size; Kokkos::parallel_for( From 1df84bb48ede8a61ba949e844745842b0b3ba11f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 08:08:46 -0600 Subject: [PATCH 286/326] Bump github/codeql-action from 3.25.10 to 3.25.11 (#2263) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.10 to 3.25.11. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/23acc5c183826b7a8a97bce3cecc52db901f8251...b611370bb5703a7efb587f9d136a52ea24c5c38c) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7ed1a206a3..e1f8aa51f8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 + uses: github/codeql-action/init@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 + uses: github/codeql-action/analyze@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 32dcabc873..4396ad1cdb 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 + uses: github/codeql-action/upload-sarif@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: sarif_file: results.sarif From 0a0c04836c0ffd7cb85ab17b3f1e34ae3c919bff Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 1 Jul 2024 09:01:14 -0600 Subject: [PATCH 287/326] sparse: spadd_symbolic fences before device values used on host (#2259) * sparse: spadd_symbolic fences before device values used on host * sparse: use prefix sum to remove explicit spadd fence --- sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 764d185f90..9744e75ec3 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -546,9 +546,7 @@ void spadd_symbolic_impl( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", range_type(exec, 0, nrows), countEntries); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - exec, nrows + 1, c_rowmap_upperbound); - Kokkos::deep_copy(exec, c_nnz_upperbound, - Kokkos::subview(c_rowmap_upperbound, nrows)); + exec, nrows + 1, c_rowmap_upperbound, c_nnz_upperbound); } ordinal_view_t c_entries_uncompressed( Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, @@ -595,6 +593,7 @@ void spadd_symbolic_impl( // provide the number of NNZ in C to user through handle size_type cmax; Kokkos::deep_copy(exec, cmax, Kokkos::subview(c_rowmap, nrows)); + exec.fence("fence before cmax used on host"); addHandle->set_c_nnz(cmax); addHandle->set_call_symbolic(); addHandle->set_call_numeric(false); From cfcde6777bc490271d113f7fc9599039e009aa74 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 1 Jul 2024 16:56:04 -0600 Subject: [PATCH 288/326] sycl: use alternative `alignPtrTo` when SYCL is enabled (SpGEMM) (#2262) * sycl: use alternative alignPtr when SYCL is enabled The current alignPtr, as well as two other alternatives below, do not work on SYCL on Intel PVC. unsigned int f1(unsigned int i, unsigned int align) { return ((i + align - 1) / align * align); } unsigned int f2(unsigned int i, unsigned int align) { return (i + align - 1) & (-align); } * alignPtrTo unit tests --- common/src/KokkosKernels_Utils.hpp | 27 ++++ common/unit_test/Test_Common.hpp | 1 + common/unit_test/Test_Common_AlignPtrTo.hpp | 166 ++++++++++++++++++++ 3 files changed, 194 insertions(+) create mode 100644 common/unit_test/Test_Common_AlignPtrTo.hpp diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 45aa8132bc..89aeabb823 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1527,6 +1527,32 @@ struct array_sum_reduce { } }; +/* Several alternatives were considered for SYCL, including + +unsigned int f1(unsigned int i, unsigned int align) +{ + return ((i + align - 1) / align * align); +} + +unsigned int f2(unsigned int i, unsigned int align) +{ + return (i + align - 1) & (-align); +} + +f1 should be equivalent to the below, but it produces incorrect results on SYCL +f2 is how GCC does std::align, but it also produces incorrect results on SYCL +possibly alignof(T) is not a power-of-2 on SYCL? Or a compiler error. +*/ +#if defined(KOKKOS_ENABLE_SYCL) +template +KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + while (ptrVal % alignof(T)) { + ++ptrVal; + } + return reinterpret_cast(ptrVal); +} +#else template KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { // ugly but computationally free and the "right" way to do this in C++ @@ -1535,6 +1561,7 @@ KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { // and the mask produces the start of that scalar_t. return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); } +#endif } // namespace Impl } // namespace KokkosKernels diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index 2ccf9c2103..fb93a494d6 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -16,6 +16,7 @@ #ifndef TEST_COMMON_HPP #define TEST_COMMON_HPP +#include #include // #include #include diff --git a/common/unit_test/Test_Common_AlignPtrTo.hpp b/common/unit_test/Test_Common_AlignPtrTo.hpp new file mode 100644 index 0000000000..f60887cd80 --- /dev/null +++ b/common/unit_test/Test_Common_AlignPtrTo.hpp @@ -0,0 +1,166 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! \file + +This test file was motivated by an observation in the SpGEMM on SYCL that +strange values were coming out of the pointer alignment functions, causing +Kokkos::atomic_add to be a no-op or write 0. The Kokkos Kernels alignPtrTo +function was updated with the one of four implementations that was observed to +work on SYCL (even though all four in here should be okay.) + +TEST_FN 0-3 are various implemetations, and TEST_FN 4 is testing Kokkos Kernels +implementation. The tests are written to PASS for the observed SYCL behavor - +i.e., that TEST_FN 1,4 produce aligned pointers, and the others do not (even +though they should). If the other functions start working on SYCL, then this +test will "fail", and the Kokkos Kernels implementation should be updated with +one of the now-working (and faster?) implementations. +*/ + +#ifndef TEST_COMMON_ALIGNPTRTO_HPP +#define TEST_COMMON_ALIGNPTRTO_HPP + +#include +#include +#include + +namespace { + +// the original Kokkos Kernels implementation +template +KOKKOS_INLINE_FUNCTION T *f0(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); +} + +// an implementation that works for SYCL +template +KOKKOS_INLINE_FUNCTION T *f1(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + while (ptrVal % alignof(T)) { + ++ptrVal; + } + return reinterpret_cast(ptrVal); +} + +// another valid implementation +template +KOKKOS_INLINE_FUNCTION T *f2(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal + alignof(T) - 1) / alignof(T) * + alignof(T)); +} + +// the way GCC does it (roughly) +template +KOKKOS_INLINE_FUNCTION T *f3(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal - uint64_t(1) + alignof(T)) & + -alignof(T)); +} + +// Function to be executed by each team +template +struct TeamFunction { + TeamFunction() = default; + TeamFunction(const Results &results) : results_(results) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const Team &team) const { + // get an "aligned" pointer to scratch memory + char *shmem = (char *)(team.team_shmem().get_shmem(team.team_size() * + sizeof(double))); + double *vals; + if constexpr (0 == TEST_FN) { + vals = f0(shmem); + } else if constexpr (1 == TEST_FN) { + vals = f1(shmem); + } else if constexpr (2 == TEST_FN) { + vals = f2(shmem); + } else if constexpr (3 == TEST_FN) { + vals = f3(shmem); + } else if constexpr (4 == TEST_FN) { + vals = KokkosKernels::Impl::alignPtrTo(shmem); + } else { + static_assert(std::is_void_v, "Unexpected test function"); + } + + const size_t i = team.team_rank(); + double val = team.team_rank(); + vals[i] = 0; // zero shared memory + Kokkos::atomic_add(&vals[i], val); +#if 0 // debugging + Kokkos::printf("%s:%i result(%lu) += %f yielded %f\n", __FILE__, __LINE__, i, val, vals[i]); +#endif + + results_(i) = vals[i]; + } + + size_t team_shmem_size(int team_size) const { + return team_size * sizeof(double); + } + + Results results_; +}; + +// use atomic add to set result(i) = i +template +void test_alignPtrTo() { + using MemorySpace = typename Device::memory_space; + using ExecSpace = typename Device::execution_space; + using TestView = Kokkos::View; + using TestPolicy = Kokkos::TeamPolicy; + const int teamSize = TestPolicy(1, Kokkos::AUTO) + .team_size_max(TeamFunction(), + Kokkos::ParallelForTag{}); + + ExecSpace space; + + TestView results("TestView", teamSize); + TestPolicy policy(space, 1, teamSize); + Kokkos::parallel_for("test alignment", policy, + TeamFunction(results)); + + int errs; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, teamSize), + KOKKOS_LAMBDA(int i, int &lerr) { lerr += (results(i) != i); }, errs); + +// if SYCL is enabled, only TEST_FN 1 and 4 should work +#if defined(KOKKOS_ENABLE_SYCL) + if constexpr (std::is_same_v) { + if constexpr ((1 == TEST_FN) || (4 == TEST_FN)) { + EXPECT_EQ(0, errs); + } else { + EXPECT_NE(0, errs); + } + } else { + EXPECT_EQ(0, errs); + } +#else + EXPECT_EQ(0, errs); +#endif +} + +TEST_F(TestCategory, common_AlignPtrTo_0) { test_alignPtrTo<0, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_1) { test_alignPtrTo<1, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_2) { test_alignPtrTo<2, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_3) { test_alignPtrTo<3, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_kk) { test_alignPtrTo<4, TestDevice>(); } + +} // anonymous namespace + +#endif // TEST_COMMON_ALIGNPTRTO From 6d7e977ba93906b37ceaa07314052d902e863547 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 2 Jul 2024 13:06:44 -0600 Subject: [PATCH 289/326] Help gcc/8.3 with ctad issue Resolves #2264 Co-authored-by: Carl Pearson --- common/unit_test/Test_Common_AlignPtrTo.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unit_test/Test_Common_AlignPtrTo.hpp b/common/unit_test/Test_Common_AlignPtrTo.hpp index f60887cd80..760cddd5a2 100644 --- a/common/unit_test/Test_Common_AlignPtrTo.hpp +++ b/common/unit_test/Test_Common_AlignPtrTo.hpp @@ -136,7 +136,7 @@ void test_alignPtrTo() { int errs; Kokkos::parallel_reduce( - Kokkos::RangePolicy(space, 0, teamSize), + Kokkos::RangePolicy(space, 0, teamSize), KOKKOS_LAMBDA(int i, int &lerr) { lerr += (results(i) != i); }, errs); // if SYCL is enabled, only TEST_FN 1 and 4 should work From 8130cf9ab45db5b2bebad4586c9827a18b2f6bcb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 08:59:56 -0600 Subject: [PATCH 290/326] Bump actions/upload-artifact from 4.3.3 to 4.3.4 (#2266) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.3 to 4.3.4. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/65462800fd760344b1a7b4382951275a0abb4808...0b2256b8c012f0828dc542b3febcab082c67f72b) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 4396ad1cdb..bf06213b40 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -65,7 +65,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 with: name: SARIF file path: results.sarif From 48e941b0b483522762e97a41ce6e9efea1677499 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 9 Jul 2024 10:43:09 -0600 Subject: [PATCH 291/326] handle_t* -> unique_ptr in Bsr SpMV unit tests (#2269) --- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index e9b23298f9..699afb2510 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -383,9 +383,9 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, using handle_t = SPMVHandle; // cover a variety of algorithms - std::vector handles; + std::vector> handles; for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) - handles.push_back(new handle_t(algo)); + handles.push_back(std::make_unique(algo)); // Tensor core algorithm temporarily disabled, fails on V100 /* @@ -405,14 +405,14 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, } */ - for (handle_t *handle : handles) { + for (std::unique_ptr &handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spmv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); if (beta == scalar_type(0)) { - test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + test_spmv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x_with_nans, y_with_nans); } } @@ -644,9 +644,9 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, SPMVHandle; // cover a variety of algorithms - std::vector handles; + std::vector> handles; for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) - handles.push_back(new handle_t(algo)); + handles.push_back(std::make_unique(algo)); // Tensor core algorithm temporarily disabled, fails on V100 /* @@ -670,14 +670,15 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); auto [x_with_nans, y_with_nans] = random_multivecs_for_spm_mv(mode, a, numVecs, true); - for (handle_t *handle : handles) { + for (std::unique_ptr &handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spm_mv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x, + y); if (beta == scalar_type(0)) { - test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + test_spm_mv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x_with_nans, y_with_nans); } } From e1cd832e135628ef27248e368ad4d29d4a8c37e6 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 9 Jul 2024 13:07:25 -0600 Subject: [PATCH 292/326] Workarounds for removed cusparse functions (#2270) cusparse 12.5 removed some functions that were deprecated, like the ILU factorizations and the legacy csrsv (sparse triangular solve) functions. As a workaround, if the cusparse version is >= 12.5 then disable the paths in perftests that call those. --- perf_test/sparse/KokkosSparse_spiluk.cpp | 56 ++++++------------ perf_test/sparse/KokkosSparse_sptrsv_aux.hpp | 62 ++++++++++++-------- 2 files changed, 56 insertions(+), 62 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index c85b126019..95dcc78ab1 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -24,7 +24,13 @@ #include #include // std::setprecision -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// cuSPARSE ILU and IC factorizations were removed +// completely in cuSPARSE 12.5 +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 12500) +#define USE_CUSPARSE_ILU +#endif + +#ifdef USE_CUSPARSE_ILU #include #endif @@ -39,8 +45,6 @@ #include #include -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \ - (!defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)) using namespace KokkosSparse; using namespace KokkosSparse::Experimental; using namespace KokkosKernels; @@ -52,8 +56,8 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, int team_size, int /*vector_length*/, /*int idx_offset,*/ int loop) { typedef default_scalar scalar_t; - typedef default_lno_t lno_t; - typedef default_size_type size_type; + typedef int lno_t; + typedef int size_type; typedef Kokkos::DefaultExecutionSpace execution_space; typedef typename execution_space::memory_space memory_space; @@ -82,6 +86,11 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "\n\n" << std::endl; if (!afilename.empty()) { +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && !defined(USE_CUSPARSE_ILU) + std::cout << "** Note: cuSPARSE is enabled, but the cusparseXcsrilu*\n"; + std::cout << " functions were removed in cuSPARSE 12.5.\n"; + std::cout << " Only KokkosKernels spiluk will be run.\n\n"; +#endif std::cout << "ILU(K) Begin: Read matrix filename " << afilename << std::endl; crsmat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix( @@ -91,11 +100,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, const int nnz = A.nnz(); const typename KernelHandle::const_nnz_lno_t fill_lev = lno_t(kin); -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU // std::cout << " cusparse: create handle" << std::endl; cusparseStatus_t status; cusparseHandle_t handle = 0; @@ -131,10 +136,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaMalloc((void **)&pBuffer, pBufferSize); -#else - std::cout << "Note: the cuSPARSE TPL is enabled, but either offset=int or " - "ordinal=int is disabled, so it can't be used.\n"; -#endif #endif for (auto test : tests) { @@ -223,11 +224,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "nrm2(A*e-L*U*e) = " << std::setprecision(15) << bb_nrm << std::endl; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU if (fill_lev == 0) { std::cout << "CUSPARSE: No KK interface added yet" << std::endl; @@ -383,7 +380,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, } // end row std::cout << "ILU(0) SUCCESS!" << std::endl; } // fill_lev=0 -#endif #endif // Benchmark @@ -407,11 +403,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME: " << max_time << std::endl; std::cout << "LOOP_MIN_TIME: " << min_time << std::endl; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU if (fill_lev == 0) { lno_view_t A_row_map("A_row_map", nrows + 1); lno_nnz_view_t A_entries("A_entries", nnz); @@ -441,21 +433,15 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME (cuSPARSE): " << max_time << std::endl; std::cout << "LOOP_MIN_TIME (cuSPARSE): " << min_time << std::endl; } // fill_lev=0 -#endif #endif } // end tests -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU // step 6: free resources cudaFree(pBuffer); cusparseDestroyCsrilu02Info(info); cusparseDestroyMatDescr(descr); cusparseDestroy(handle); -#endif #endif } // end if (!afilename.empty()) @@ -583,9 +569,3 @@ int main(int argc, char **argv) { Kokkos::finalize(); return 0; } -#else -int main() { - std::cout << "The SPILUK perf_test requires CUDA >= 8.0\n"; - return 0; -} -#endif diff --git a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp index 65120a8827..6b9c244da3 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp @@ -228,25 +228,37 @@ std::string getCuSparseErrorString(cusparseStatus_t status) { /* ========================================================================================= */ #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#if CUSPARSE_VERSION >= 12500 +template +bool check_cusparse(host_crsmat_t &, bool, crsmat_t &, bool, crsmat_t &, int *, + int *, double, int) { + // TODO: call KokkosSparse::sptrsv (if hardcoded problem settings below are + // compatible), or add wrappers for modern interface (cusparseSpSV*) + throw std::logic_error("Legacy cuSPARSE csrsv interface not available."); + return false; +} + +#else + template bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, bool col_majorU, crsmat_t &U, int *perm_r, int *perm_c, double tol, int loop) { using values_view_t = typename crsmat_t::values_type::non_const_type; - using scalar_t = typename values_view_t::value_type; - using size_type = typename crsmat_t::size_type; + using scalar_t = typename values_view_t::value_type; + using size_type = typename crsmat_t::size_type; using host_values_view_t = typename host_crsmat_t::values_type::non_const_type; using execution_space = typename values_view_t::execution_space; - using memory_space = typename execution_space::memory_space; + using memory_space = typename execution_space::memory_space; using host_execution_space = typename host_values_view_t::execution_space; - using host_memory_space = typename host_execution_space::memory_space; + using host_memory_space = typename host_execution_space::memory_space; using host_scalar_view_t = Kokkos::View; - using scalar_view_t = Kokkos::View; + using scalar_view_t = Kokkos::View; const scalar_t ZERO(0.0); const scalar_t ONE(1.0); @@ -258,7 +270,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // > create a handle cusparseStatus_t status; cusparseHandle_t handle = 0; - status = cusparseCreate(&handle); + status = cusparseCreate(&handle); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreate failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -269,7 +281,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // > create a empty info structure for L-solve (e.g., analysis results) csrsv2Info_t infoL = 0; - status = cusparseCreateCsrsv2Info(&infoL); + status = cusparseCreateCsrsv2Info(&infoL); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateCsrsv2Info failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -279,14 +291,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // Preparing for L-solve // step 1: create a descriptor size_type nnzL = L.nnz(); - auto graphL = L.graph; // in_graph - auto row_mapL = graphL.row_map; - auto entriesL = graphL.entries; - auto valuesL = L.values; + auto graphL = L.graph; // in_graph + auto row_mapL = graphL.row_map; + auto entriesL = graphL.entries; + auto valuesL = L.values; // NOTE: it is stored in CSC = UPPER + TRANSPOSE cusparseMatDescr_t descrL = 0; - status = cusparseCreateMatDescr(&descrL); + status = cusparseCreateMatDescr(&descrL); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateMatDescr failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -300,7 +312,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // step 2: query how much memory used in csrsv2, and allocate the buffer // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. int pBufferSize; - void *pBufferL = 0; + void *pBufferL = 0; cusparseOperation_t transL = (col_majorL ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE); if (std::is_same::value) { @@ -374,14 +386,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, timer.reset(); if (std::is_same::value) { const double alpha = 1.0; - status = cusparseDcsrsv2_solve( + status = cusparseDcsrsv2_solve( handle, transL, nrows, nnzL, &alpha, descrL, reinterpret_cast(valuesL.data()), row_mapL.data(), entriesL.data(), infoL, reinterpret_cast(rhs.data()), reinterpret_cast(sol.data()), policy, pBufferL); } else { const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - status = cusparseZcsrsv2_solve( + status = cusparseZcsrsv2_solve( handle, transL, nrows, nnzL, &alpha, descrL, reinterpret_cast(valuesL.data()), row_mapL.data(), entriesL.data(), infoL, reinterpret_cast(rhs.data()), @@ -404,14 +416,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // Preparing for U-solve size_type nnzU = U.nnz(); - auto graphU = U.graph; // in_graph - auto row_mapU = graphU.row_map; - auto entriesU = graphU.entries; - auto valuesU = U.values; + auto graphU = U.graph; // in_graph + auto row_mapU = graphU.row_map; + auto entriesU = graphU.entries; + auto valuesU = U.values; // > create a empty info structure for U-solve (e.g., analysis results) csrsv2Info_t infoU = 0; - status = cusparseCreateCsrsv2Info(&infoU); + status = cusparseCreateCsrsv2Info(&infoU); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateCsrsv2Info failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -420,7 +432,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // step 1: create a descriptor cusparseMatDescr_t descrU = 0; - status = cusparseCreateMatDescr(&descrU); + status = cusparseCreateMatDescr(&descrU); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateMatDescr create status error name " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -438,7 +450,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // step 2: query how much memory used in csrsv2, and allocate the buffer // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. - void *pBufferU = 0; + void *pBufferU = 0; cusparseOperation_t transU = (col_majorU ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE); if (std::is_same::value) { @@ -485,14 +497,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, timer.reset(); if (std::is_same::value) { const double alpha = 1.0; - status = cusparseDcsrsv2_solve( + status = cusparseDcsrsv2_solve( handle, transU, nrows, nnzU, &alpha, descrU, reinterpret_cast(valuesU.data()), row_mapU.data(), entriesU.data(), infoU, reinterpret_cast(sol.data()), reinterpret_cast(rhs.data()), policy, pBufferU); } else { const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - status = cusparseZcsrsv2_solve( + status = cusparseZcsrsv2_solve( handle, transU, nrows, nnzU, &alpha, descrU, reinterpret_cast(valuesU.data()), row_mapU.data(), entriesU.data(), infoU, reinterpret_cast(sol.data()), @@ -652,6 +664,8 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, } return success; } +#endif + #else template bool check_cusparse(host_crsmat_t & /*Mtx*/, bool /*col_majorL*/, From ea430c3f558b812b47d6bfc70965005acc9652a2 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 9 Jul 2024 21:45:44 -0600 Subject: [PATCH 293/326] BLAS - gemv: using fallback when mode is 't' or 'c' and onemkl is used (#2272) --- blas/src/KokkosBlas2_gemv.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 614b48d47a..e68f2cca75 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -163,9 +163,11 @@ void gemv(const ExecutionSpace& space, const char trans[], #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #ifdef KOKKOS_ENABLE_SYCL // oneMKL supports both row-major and column-major of A + // but only supports oneapi::mkl::transpose::nontrans op useFallback = - useFallback || !std::is_same_v; + useFallback || ((tolower(*trans) == 't' || tolower(*trans) == 'c') && + std::is_same_v); #endif #endif From 994891a23207e5ebbbb81ddbb5f02d90343a2606 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:38:55 +0200 Subject: [PATCH 294/326] Implement batched serial pttrf (#2256) * Batched serial pttrf implementation * fix: use GEMM to add matrices * fix: initialization order * fformat * fix: temporary variable in a test code * fix: docstring of pttrf * check_positive_definitiveness only if KOKKOSKERNELS_DEBUG_LEVEL > 0 * Improve the test for pttrf * fix: int type * fix: cleanup tests for SerialPttrf * cleanup: remove unused deep_copies * fix: docstrings and comments for pttrf * ConjTranspose with conj and Transpose * quick return in pttrf for size 1 or 0 matrix * Add tests for invalid input * fix: info computation --------- Co-authored-by: Yuuichi Asahi --- .../impl/KokkosBatched_Pttrf_Serial_Impl.hpp | 73 +++ .../KokkosBatched_Pttrf_Serial_Internal.hpp | 211 ++++++++ batched/dense/src/KokkosBatched_Pttrf.hpp | 52 ++ .../dense/unit_test/Test_Batched_Dense.hpp | 3 + .../unit_test/Test_Batched_DenseUtils.hpp | 40 ++ .../unit_test/Test_Batched_SerialPttrf.hpp | 467 ++++++++++++++++++ .../Test_Batched_SerialPttrf_Complex.hpp | 31 ++ .../Test_Batched_SerialPttrf_Real.hpp | 31 ++ blas/impl/KokkosBlas_util.hpp | 1 + 9 files changed, 909 insertions(+) create mode 100644 batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp create mode 100644 batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp create mode 100644 batched/dense/src/KokkosBatched_Pttrf.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp new file mode 100644 index 0000000000..b0ea39fa3f --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp @@ -0,0 +1,73 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pttrf_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPttrfInput( + [[maybe_unused]] const DViewType &d, [[maybe_unused]] const EViewType &e) { + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); + + static_assert(DViewType::rank == 1, + "KokkosBatched::pttrf: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, + "KokkosBatched::pttrf: EViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int nd = d.extent(0); + const int ne = e.extent(0); + + if (ne + 1 != nd) { + Kokkos::printf( + "KokkosBatched::pttrf: Dimensions of d and e do not match: d: %d, e: " + "%d \n" + "e.extent(0) must be equal to d.extent(0) - 1\n", + nd, ne); + return 1; + } +#endif + return 0; +} + +template <> +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e) { + // Quick return if possible + if (d.extent(0) == 0) return 0; + if (d.extent(0) == 1) return (d(0) < 0 ? 1 : 0); + + auto info = checkPttrfInput(d, e); + if (info) return info; + + return SerialPttrfInternal::invoke( + d.extent(0), d.data(), d.stride(0), e.data(), e.stride(0)); + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp new file mode 100644 index 0000000000..5b4d3fb182 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp @@ -0,0 +1,211 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +struct SerialPttrfInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, + ValueType *KOKKOS_RESTRICT d, + const int ds0, + ValueType *KOKKOS_RESTRICT e, + const int es0); + + template + KOKKOS_INLINE_FUNCTION static int invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0); +}; + +/// +/// Real matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + ValueType *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto ei_tmp = e[i * es0]; + e[i * es0] = ei_tmp / d[i * ds0]; + d[(i + 1) * ds0] -= e[i * es0] * ei_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. + const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +/// +/// Complex matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto eir_tmp = e[i * es0].real(); + auto eii_tmp = e[i * es0].imag(); + auto f_tmp = eir_tmp / d[i * ds0]; + auto g_tmp = eii_tmp / d[i * ds0]; + e[i * es0] = Kokkos::complex(f_tmp, g_tmp); + d[(i + 1) * ds0] = d[(i + 1) * ds0] - f_tmp * eir_tmp - g_tmp * eii_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. + const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pttrf.hpp b/batched/dense/src/KokkosBatched_Pttrf.hpp new file mode 100644 index 0000000000..4fcc944dc8 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pttrf.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_HPP_ +#define KOKKOSBATCHED_PTTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pttrf: +/// Compute the Cholesky factorization L*D*L**T (or L*D*L**H) of a real +/// symmetric (or complex Hermitian) positive definite tridiagonal matrix A_l +/// for all l = 0, ..., N +/// +/// \tparam DViewType: Input type for the a diagonal matrix, needs to be a 1D +/// view +/// \tparam EViewType: Input type for the a upper/lower diagonal matrix, +/// needs to be a 1D view +/// +/// \param d [inout]: n diagonal elements of the diagonal matrix D +/// \param e [inout]: n-1 upper/lower diagonal elements of the diagonal matrix E +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pttrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PTTRF_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 7b0ee58312..76215b58f8 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -49,6 +49,9 @@ #include "Test_Batched_SerialTrtri_Real.hpp" #include "Test_Batched_SerialTrtri_Complex.hpp" #include "Test_Batched_SerialSVD.hpp" +#include "Test_Batched_SerialPttrf.hpp" +#include "Test_Batched_SerialPttrf_Real.hpp" +#include "Test_Batched_SerialPttrf_Complex.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index 689ff4f7a5..c1328291fb 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -111,6 +111,46 @@ void create_banded_triangular_matrix(InViewType& in, OutViewType& out, } Kokkos::deep_copy(out, h_out); } + +/// \brief Create a diagonal matrix from an input vector: +/// Copies the input vector into the diagonal of the output matrix specified +/// by the parameter k. k > 0 means that the matrix is upper-diagonal and +/// k < 0 means the lower-diagonal. k = 0 means the diagonal. +/// +/// \tparam InViewType: Input type for the vector, needs to be a 2D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched vector, a rank 2 view +/// \param out [out]: Output batched matrix, where the diagonal compnent +/// specified by k is filled with the input vector, a rank 3 view +/// \param k [in]: The diagonal offset to be filled (default is 0). +/// +template +void create_diagonal_matrix(InViewType& in, OutViewType& out, int k = 0) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == in.extent(1) + abs(k)); + + int i1_start = k >= 0 ? 0 : -k; + int i2_start = k >= 0 ? k : 0; + + // Zero clear the output matrix + using ScalarType = typename OutViewType::non_const_value_type; + Kokkos::deep_copy(h_out, ScalarType(0.0)); + + Kokkos::deep_copy(h_in, in); + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + h_out(i0, i1 + i1_start, i1 + i2_start) = h_in(i0, i1); + } + } + + Kokkos::deep_copy(out, h_out); +} + } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp new file mode 100644 index 0000000000..6ee7818ddc --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp @@ -0,0 +1,467 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pttrf.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pttrf { + +template +struct Functor_BatchedSerialPttrf { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) + : _d(d), _e(e) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k, int &info) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPttrf::invoke(dd, ee); + } + + inline int run() { + using value_type = typename DViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, + const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, + _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pttrf test for random matrix +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d("d", N, BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e_upper("e_upper", N, BlkSize - 1), + e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + RealType realRandStart, realRandEnd; + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, realRandStart, realRandEnd); + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + + // Add BlkSize to ensure positive definiteness + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, + realRandEnd + BlkSize); + Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); + + auto h_e_upper = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_lower = Kokkos::create_mirror_view(e_lower); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize - 1; i++) { + // Fill the lower diagonal with conjugate of the upper diagonal + h_e_lower(ib, i) = + Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + } + } + + Kokkos::deep_copy(e_lower, h_e_lower); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiagonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(e_upper, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e_lower) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**H + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // FIXME: We should use SerialGemm Trans::ConjTranspose. + // For the moment, we compute the complex conjugate of L and + // then use Trans::Transpose. + // Gemm to compute (L*D)*L**H -> A_reconst + // Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + // A_reconst) + // .run(); + + // Compute the complex conjugate of L + // L -> conj(L) + auto h_L = Kokkos::create_mirror_view(L); + Kokkos::deep_copy(h_L, L); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_L(ib, i, j) = Kokkos::ArithTraits::conj(h_L(ib, i, j)); + } + } + } + Kokkos::deep_copy(L, h_L); + + // Gemm to compute (L*D)*(conj(L))**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L**H + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +template +/// \brief Implementation details of batched pttrf test for early return +/// BlkSize must be 0 or 1 +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + if (BlkSize > 1) return; + + const int BlkSize_minus_1 = BlkSize > 0 ? BlkSize - 1 : 0; + + RealView2DType d("d", N, BlkSize), + d2("d2", N, BlkSize); // Diagonal components + View2DType e("e", N, + BlkSize_minus_1); // lower diagonal components + + const RealType reference_value = 4.0; + + Kokkos::deep_copy(d, reference_value); + Kokkos::deep_copy(d2, -reference_value); + Kokkos::deep_copy(e, ScalarType(1.0)); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + // Early return if BlkSize is 0 or 1 + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + // For negative values, info should be 1 for BlkSize = 1 + auto info2 = Functor_BatchedSerialPttrf(d2, e) + .run(); + + Kokkos::fence(); + + int expected_info2 = BlkSize == 0 ? 0 : N; + EXPECT_EQ(info, 0); + EXPECT_EQ(info2, expected_info2); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_d = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d); + auto h_d2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d2); + + // Check if d is unchanged + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_d(ib, i), reference_value, eps); + EXPECT_NEAR_KK(h_d2(ib, i), -reference_value, eps); + } + } +} + +template +/// \brief Implementation details of batched pttrf test +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize - 1); // Upper and lower diagonal components (identical) + + Kokkos::deep_copy(d, RealType(4.0)); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiaonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(e, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**T + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**T + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // Gemm to compute (L*D)*L**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L.T + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +} // namespace Pttrf +} // namespace Test + +template +int test_batched_pttrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp new file mode 100644 index 0000000000..febccc5cb3 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_fcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_dcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp new file mode 100644 index 0000000000..8b0fb658fe --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_float) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_double) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index ecb72e7c9a..1fc6b7d480 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -85,6 +85,7 @@ struct Algo { using SolveLU = Level3; using QR = Level3; using UTV = Level3; + using Pttrf = Level3; struct Level2 { struct Unblocked {}; From d310f1aa8c4b2bc7513d34eff0286190c11460ab Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 10 Jul 2024 14:29:55 -0600 Subject: [PATCH 295/326] A little sptrsv cleanup before the main block effort (#2247) * Some cleanup and refactoring * First round of cleanup complete * Fix a couple warnings * formatting --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 26 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 6557 ++++++++--------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 39 +- sparse/src/KokkosKernels_Handle.hpp | 6 +- sparse/src/KokkosSparse_sptrsv_handle.hpp | 134 +- 5 files changed, 3267 insertions(+), 3495 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 415ccf87a0..3caa2bcc31 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -47,20 +47,18 @@ struct IlukWrap { // // Useful types // - using execution_space = typename IlukHandle::execution_space; - using memory_space = typename IlukHandle::memory_space; - using lno_t = typename IlukHandle::nnz_lno_t; - using size_type = typename IlukHandle::size_type; - using scalar_t = typename IlukHandle::nnz_scalar_t; - using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; - using HandleDeviceValueType = typename IlukHandle::nnz_value_view_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - using LevelViewType = typename IlukHandle::nnz_lno_view_t; - using karith = typename Kokkos::ArithTraits; - using team_policy = typename IlukHandle::TeamPolicy; - using member_type = typename team_policy::member_type; - using range_policy = typename IlukHandle::RangePolicy; + using execution_space = typename IlukHandle::execution_space; + using memory_space = typename IlukHandle::memory_space; + using lno_t = typename IlukHandle::nnz_lno_t; + using size_type = typename IlukHandle::size_type; + using scalar_t = typename IlukHandle::nnz_scalar_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + using LevelViewType = typename IlukHandle::nnz_lno_view_t; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename IlukHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename IlukHandle::RangePolicy; static team_policy get_team_policy(const size_type nrows, const int team_size) { diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a64a4d23bc..d385a390cd 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -27,15 +27,11 @@ #include #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV - // Enable supernodal sptrsv #include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" - #include "KokkosBatched_Util.hpp" - #include "KokkosBlas2_team_gemv_spec.hpp" - #include "KokkosBatched_Trsm_Team_Impl.hpp" #endif @@ -48,834 +44,997 @@ #include "cuda_profiler_api.h" #endif -namespace KokkosSparse { -namespace Impl { -namespace Experimental { - #if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \ defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) #define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT #endif -struct UnsortedTag {}; - -struct LargerCutoffTag {}; - -struct UnsortedLargerCutoffTag {}; - -template -void print_view1d_solve(const ViewType dv, size_t range = 0) { - auto v = Kokkos::create_mirror_view(dv); - Kokkos::deep_copy(v, dv); - std::cout << "Output for view " << v.label() << std::endl; - range = range == 0 ? dv.extent(0) : range; - for (size_t i = 0; i < range; ++i) { - std::cout << "v(" << i << ") = " << v(i) << " , "; - } - std::cout << std::endl; -} - -// Needed for cudagraphs -struct EmptyFunctor { - KOKKOS_INLINE_FUNCTION - void operator()(const int) const {} -}; - -// This functor unifies the lower and upper implementations, the hope is the -// "is_lowertri" check does not add noticable time on larger problems -template -struct TriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - - TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const bool &is_lowertri_, const long &node_count_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } -}; - -template -struct TriLvlSchedTP1SolverFunctorDiagValues { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - ValuesType diagonal_values; // inserted according to rowid - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long dense_nrows; - - TriLvlSchedTP1SolverFunctorDiagValues(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, - LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const ValuesType &diagonal_values_, - const bool is_lowertri_, - long node_count_, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - diagonal_values(diagonal_values_), - is_lowertri(is_lowertri_), - node_count(node_count_), - dense_nrows(dense_nrows_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); +namespace KokkosSparse { +namespace Impl { +namespace Experimental { - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : - // (rhs_rowid+diff)/values(soffset); - lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); +template +struct SptrsvWrap { + // + // Useful types + // + using execution_space = typename TriSolveHandle::execution_space; + using memory_space = typename TriSolveHandle::memory_space; + using temp_mem_space = typename TriSolveHandle::HandleTempMemorySpace; + using lno_t = typename TriSolveHandle::nnz_lno_t; + using size_type = typename TriSolveHandle::size_type; + using scalar_t = typename TriSolveHandle::scalar_t; + using row_map_t = typename TriSolveHandle::nnz_row_view_t; + using entries_t = typename TriSolveHandle::nnz_lno_view_t; + using values_t = typename TriSolveHandle::nnz_scalar_view_t; + using work_view_t = + Kokkos::View>; + using work_view_int_t = + Kokkos::View>; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename TriSolveHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename TriSolveHandle::RangePolicy; + using range_type = Kokkos::pair; + + // Tag structs + struct UnsortedTag {}; + struct LargerCutoffTag {}; + struct UnsortedLargerCutoffTag {}; + + template + static void print_view1d_solve(const ViewType dv, size_t range = 0) { + auto v = Kokkos::create_mirror_view(dv); + Kokkos::deep_copy(v, dv); + std::cout << "Output for view " << v.label() << std::endl; + range = range == 0 ? dv.extent(0) : range; + for (size_t i = 0; i < range; ++i) { + std::cout << "v(" << i << ") = " << v(i) << " , "; } + std::cout << std::endl; } -}; - -template -struct TriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - const bool is_lowertri; - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - long dense_nrows; - - TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, - long node_groups_ = 0, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_), - node_groups(node_groups_), - dense_nrows(dense_nrows_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange + // Needed for cudagraphs + struct EmptyFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int) const {} + }; + + // This functor unifies the lower and upper implementations, the hope is the + // "is_lowertri" check does not add noticable time on larger problems + template + struct TriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + + TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const bool &is_lowertri_, + const long &node_count_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + is_lowertri(is_lowertri_), + node_count(node_count_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); - team.team_barrier(); - } + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + team.team_barrier(); - size_t nrows = row_map.extent(0) - 1; + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) + : (rhs_rowid + diff) / values(soffset); + } + } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + auto diag = -1; - team.team_barrier(); - } -}; - -// Lower vs Upper Multi-block Functors - -template -struct LowerTriLvlSchedRPSolverFunctor { - typedef typename EntriesType::non_const_value_type lno_t; - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + team.team_barrier(); - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } -}; - -template -struct LowerTriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + } + }; + + template + struct TriLvlSchedTP1SolverFunctorDiagValues { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + ValuesType diagonal_values; // inserted according to rowid + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long dense_nrows; + + TriLvlSchedTP1SolverFunctorDiagValues( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const ValuesType &diagonal_values_, const bool is_lowertri_, + long node_count_, long dense_nrows_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + diagonal_values(diagonal_values_), + is_lowertri(is_lowertri_), + node_count(node_count_), + dense_nrows(dense_nrows_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : + // (rhs_rowid+diff)/values(soffset); + lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); + } } - } + }; + + template + struct TriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + const bool is_lowertri; + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + long dense_nrows; + + TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const bool is_lowertri_, long node_count_, + long node_groups_ = 0, long dense_nrows_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + is_lowertri(is_lowertri_), + node_count(node_count_), + node_groups(node_groups_), + dense_nrows(dense_nrows_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri + ? (rhs_rowid + diff) / values(eoffset - 1) + : (rhs_rowid + diff) / values(soffset); + } // end if + }); // end TeamThreadRange - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); + team.team_barrier(); } - } -}; - -// FIXME CUDA: This algorithm not working with all integral type combos -// In any case, this serves as a skeleton for 3-level hierarchical parallelism -// for alg dev -template -struct LowerTriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + team.team_barrier(); + } + }; + + // Lower vs Upper Multi-block Functors + + template + struct LowerTriLvlSchedRPSolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + // Assuming indices are sorted per row, diag entry is final index in the + // list + + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + + for (long ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + lhs(rowid) = rhs_rowid / val; + } + } // end for ptr + } - size_t nrows = row_map.extent(0) - 1; + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + auto diag = -1; + + for (long ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + diag = ptr; + } + } // end for ptr + lhs(rowid) = rhs_rowid / values(diag); + } + }; + + template + struct LowerTriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + team.team_barrier(); - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + } + } - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - team.team_barrier(); - } -}; + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -// ----------------------------------------------------------- -// Helper functors for Lower-triangular solve with SpMV -template -struct SparseTriSupernodalSpMVFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; + auto diag = -1; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + team.team_barrier(); - using scalar_t = typename LHSType::non_const_value_type; + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } + } + }; + + // FIXME CUDA: This algorithm not working with all integral type combos + // In any case, this serves as a skeleton for 3-level hierarchical parallelism + // for alg dev + template + struct LowerTriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + } // end if + }); // end TeamThreadRange - using work_view_t = - typename Kokkos::View>; - - int flag; - long node_count; - NGBLType nodes_grouped_by_level; - - const int *supercols; - const int *workoffset; - - LHSType X; - work_view_t work; - - // constructor - SparseTriSupernodalSpMVFunctor(int flag_, long node_count_, - const NGBLType &nodes_grouped_by_level_, - const int *supercols_, const int *workoffset_, - LHSType &X_, work_view_t work_) - : flag(flag_), - node_count(node_count_), - nodes_grouped_by_level(nodes_grouped_by_level_), - supercols(supercols_), - workoffset(workoffset_), - X(X_), - work(work_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); + team.team_barrier(); + } - auto s = nodes_grouped_by_level(node_count + league_rank); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange - // copy vector elements for the diagonal to input vector (work) - // and zero out the corresponding elements in output (X) - int w1 = workoffset[s]; - int j1 = supercols[s]; - // number of columns in the s-th supernode column - int nscol = supercols[s + 1] - j1; + team.team_barrier(); + } + }; - if (flag == -2) { - // copy X to work - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = X(j1 + j); - } - } else if (flag == -1) { - // copy work to X - for (int j = team_rank; j < nscol; j += team_size) { - X(j1 + j) = work(w1 + j); - } - } else if (flag == 1) { - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = X(j1 + j); - X(j1 + j) = zero; - } - } else { - // reinitialize work to zero - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = zero; +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) + // ----------------------------------------------------------- + // Helper functors for Lower-triangular solve with SpMV + template + struct SparseTriSupernodalSpMVFunctor { + int flag; + long node_count; + entries_t nodes_grouped_by_level; + + const int *supercols; + const int *workoffset; + + LHSType X; + work_view_t work; + + // constructor + SparseTriSupernodalSpMVFunctor(int flag_, long node_count_, + const entries_t &nodes_grouped_by_level_, + const int *supercols_, + const int *workoffset_, LHSType &X_, + work_view_t work_) + : flag(flag_), + node_count(node_count_), + nodes_grouped_by_level(nodes_grouped_by_level_), + supercols(supercols_), + workoffset(workoffset_), + X(X_), + work(work_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // copy vector elements for the diagonal to input vector (work) + // and zero out the corresponding elements in output (X) + int w1 = workoffset[s]; + int j1 = supercols[s]; + // number of columns in the s-th supernode column + int nscol = supercols[s + 1] - j1; + + if (flag == -2) { + // copy X to work + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = X(j1 + j); + } + } else if (flag == -1) { + // copy work to X + for (int j = team_rank; j < nscol; j += team_size) { + X(j1 + j) = work(w1 + j); + } + } else if (flag == 1) { + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = X(j1 + j); + X(j1 + j) = zero; + } + } else { + // reinitialize work to zero + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = zero; + } } + team.team_barrier(); } - team.team_barrier(); - } -}; - -// ----------------------------------------------------------- -// Functor for Lower-triangular solve -template -struct LowerTriSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; + }; + + // ----------------------------------------------------------- + // Functor for Lower-triangular solve + template + struct LowerTriSupernodalFunctor { + const bool unit_diagonal; + const bool invert_diagonal; + const bool invert_offdiagonal; + const int *supercols; + ColptrView colptr; + RowindType rowind; + ValuesType values; + + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; + + LHSType X; + + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; + + entries_t nodes_grouped_by_level; + + long node_count; + + // constructor + LowerTriSupernodalFunctor( // supernode info + const bool unit_diagonal_, const bool invert_diagonal_, + const bool invert_offdiagonal_, const int *supercols_, + // L in CSC + const ColptrView &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + LHSType &X_, + // workspace + work_view_t work_, work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, long node_count_) + : unit_diagonal(unit_diagonal_), + invert_diagonal(invert_diagonal_), + invert_offdiagonal(invert_offdiagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- + */ + /* get inputs */ + /* ---------------------------------------------------------------------- + */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // supernodal column size + const int j1 = supercols[s]; + const int j2 = supercols[s + 1]; + // > number of columns in the s-th supernode column + const int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + const int i1 = colptr(j1); + const int nsrow = colptr(j1 + 1) - i1; + + // create a view for the s-th supernocal column + // NOTE: we currently supports only default_layout = LayoutLeft + scalar_t *dataL = const_cast(values.data()); + Kokkos::View + viewL(&dataL[i1], nsrow, nscol); + + // extract part of the solution, corresponding to the diagonal block + auto Xj = Kokkos::subview(X, range_type(j1, j2)); - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - using range_type = Kokkos::pair; - - const bool unit_diagonal; - const bool invert_diagonal; - const bool invert_offdiagonal; - const int *supercols; - ColptrView colptr; - RowindType rowind; - ValuesType values; - - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; - - LHSType X; - - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; - - NGBLType nodes_grouped_by_level; - - long node_count; - - // constructor - LowerTriSupernodalFunctor( // supernode info - const bool unit_diagonal_, const bool invert_diagonal_, - const bool invert_offdiagonal_, const int *supercols_, - // L in CSC - const ColptrView &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - int level_, integer_view_t &kernel_type_, - integer_view_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - LHSType &X_, // workspace - work_view_t work_, integer_view_t &work_offset_, - // - const NGBLType &nodes_grouped_by_level_, long node_count_) - : unit_diagonal(unit_diagonal_), - invert_diagonal(invert_diagonal_), - invert_offdiagonal(invert_offdiagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); + const int workoffset = work_offset(s); + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, workoffset + nsrow)); - // supernodal column size - const int j1 = supercols[s]; - const int j2 = supercols[s + 1]; - // > number of columns in the s-th supernode column - const int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - const int i1 = colptr(j1); - const int nsrow = colptr(j1 + 1) - i1; + if (diag_kernel_type(level) != 3) { // not a device-level TRSM-solve + if (invert_offdiagonal) { + // combined TRSM solve with diagonal + GEMV update with off-diagonal + auto Y = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + nsrow)); // needed for gemv instead of trmv/trsv + auto Ljj = + Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Ljj, + Xj, + zero, + Y); + team.team_barrier(); + for (int ii = team_rank; ii < nscol; ii += team_size) { + Xj(ii) = Y(ii); + } + team.team_barrier(); + } else { + /* TRSM with diagonal block */ + // extract diagonal and off-diagonal blocks of L + auto Ljj = + Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + // workspace + auto Y = Kokkos::subview( + work, + range_type(workoffset, + workoffset + + nscol)); // needed for gemv instead of trmv/trsv + for (int ii = team_rank; ii < nscol; ii += team_size) { + Y(ii) = Xj(ii); + } + team.team_barrier(); + // calling team-level "Unblocked" gemv on small-size diagonal in + // KokkosBatched + KokkosBlas::TeamGemv< + member_type, KokkosBlas::Trans::NoTranspose, + KokkosBlas::Algo::Gemv::Unblocked>::invoke(team, one, Ljj, Y, + zero, Xj); + } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); + if (unit_diagonal) { + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + Xjj); + } else { + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + Xjj); + } + } + team.team_barrier(); - // create a view for the s-th supernocal column - // NOTE: we currently supports only default_layout = LayoutLeft - scalar_t *dataL = const_cast(values.data()); - Kokkos::View - viewL(&dataL[i1], nsrow, nscol); + /* GEMM to update with off diagonal blocks */ + auto Lij = + Kokkos::subview(viewL, range_type(nscol, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Lij, + Xj, + zero, + Z); + team.team_barrier(); + } + } - // extract part of the solution, corresponding to the diagonal block - auto Xj = Kokkos::subview(X, range_type(j1, j2)); + /* scatter vectors back into X */ + int i2 = i1 + nscol; // offset into rowind + int nsrow2 = + nsrow - + nscol; // "total" number of rows in all the off-diagonal supernodes + Kokkos::View> + Xatomic(X.data(), X.extent(0)); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); + } + team.team_barrier(); + } + }; - // workspace - const int workoffset = work_offset(s); - auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); - - if (diag_kernel_type(level) != 3) { // not a device-level TRSM-solve - if (invert_offdiagonal) { - // combined TRSM solve with diagonal + GEMV update with off-diagonal - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nsrow)); // needed for gemv instead of trmv/trsv - auto Ljj = Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Ljj, Xj, - zero, - Y); - team.team_barrier(); - for (int ii = team_rank; ii < nscol; ii += team_size) { - Xj(ii) = Y(ii); - } + // ----------------------------------------------------------- + // Functor for Upper-triangular solve in CSR + template + struct UpperTriSupernodalFunctor { + // NOTE: we currently supports only default_layout = LayoutLeft + using SupernodeView = + typename Kokkos::View; + + bool invert_diagonal; + const int *supercols; + ColptrType colptr; + RowindType rowind; + ValuesType values; + + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; + + LHSType X; + + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; + + entries_t nodes_grouped_by_level; + + long node_count; + + // constructor + UpperTriSupernodalFunctor( // supernode info + bool invert_diagonal_, const int *supercols_, + // U in CSR + const ColptrType &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + LHSType &X_, + // workspace + work_view_t &work_, work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, long node_count_) + : invert_diagonal(invert_diagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- + */ + /* get inputs */ + /* ---------------------------------------------------------------------- + */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // number of columns in the s-th supernode column + int j1 = supercols[s]; + int j2 = supercols[s + 1]; + int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + int i1 = colptr(j1); + int nsrow = colptr(j1 + 1) - i1; + + // create a view of the s-th supernocal row of U + scalar_t *dataU = const_cast(values.data()); + SupernodeView viewU(&dataU[i1], nsrow, nscol); + + // extract part of solution, corresponding to the diagonal block U(s, s) + auto Xj = Kokkos::subview(X, range_type(j1, j2)); + using Xj_type = decltype(Xj); + + // workspaces + int workoffset = work_offset(s); + + // "total" number of rows in all the off-diagonal supernodes + int nsrow2 = nsrow - nscol; + /* gather vector into Z */ + int i2 = i1 + nscol; // offset into rowind + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, + workoffset + + nsrow)); // needed with gemv for update&scatter + using Z_type = decltype(Z); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Z(ii) = X(i); + } + team.team_barrier(); + /* GEMM to update with off diagonal blocks, Xj = -Uij^T * Z */ + if (diag_kernel_type(level) != 3) { + // not device-level GEMV-udpate + auto Uij = + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + using Uij_type = decltype(Uij); + KokkosBlas::TeamGemv:: + template invoke( + team, -one, Uij, Z, one, Xj); team.team_barrier(); - } else { + /* TRSM with diagonal block */ - // extract diagonal and off-diagonal blocks of L - auto Ljj = Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); + // extract diagonal and off-diagonal blocks of U + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + using Ujj_type = decltype(Ujj); + if (invert_diagonal) { // workspace auto Y = Kokkos::subview( @@ -883,894 +1042,540 @@ struct LowerTriSupernodalFunctor { range_type( workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv + using Y_type = decltype(Y); for (int ii = team_rank; ii < nscol; ii += team_size) { Y(ii) = Xj(ii); } team.team_barrier(); - // calling team-level "Unblocked" gemv on small-size diagonal in - // KokkosBatched - KokkosBlas::TeamGemv::invoke(team, - one, - Ljj, - Y, - zero, - Xj); + + // caling team-level kernel in KokkosBatched on a small-size diagonal + KokkosBlas::TeamGemv:: + template invoke( + team, one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); - if (unit_diagonal) { - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, - Xjj); + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::Transpose, + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, + Xjj); + } + team.team_barrier(); + } + } + }; + + // ----------------------------------------------------------- + // Functor for Upper-triangular solve in CSC + template + struct UpperTriTranSupernodalFunctor { + const bool invert_diagonal; + const bool invert_offdiagonal; + const int *supercols; + ColptrType colptr; + RowindType rowind; + ValuesType values; + + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; + + LHSType X; + + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; + + entries_t nodes_grouped_by_level; + + long node_count; + + // constructor + UpperTriTranSupernodalFunctor( // supernode info + const bool invert_diagonal_, const bool invert_offdiagonal_, + const int *supercols_, + + // U in CSC + const ColptrType &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + const int level_, const work_view_int_t &kernel_type_, + const work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + const LHSType &X_, + // workspace + const work_view_t &work_, const work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, const long node_count_) + : invert_diagonal(invert_diagonal_), + invert_offdiagonal(invert_offdiagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- + */ + /* get inputs */ + /* ---------------------------------------------------------------------- + */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // number of columns in the s-th supernode column + const int j1 = supercols[s]; + const int j2 = supercols[s + 1]; + const int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + const int i1 = colptr(j1); + const int nsrow = colptr(j1 + 1) - i1; + // "total" number of rows in all the off-diagonal supernodes + const int nsrow2 = nsrow - nscol; + + // create a view of the s-th supernocal column of U + // NOTE: we currently supports only default_layout = LayoutLeft + scalar_t *dataU = const_cast(values.data()); + Kokkos::View + viewU(&dataU[i1], nsrow, nscol); + + // extract part of solution, corresponding to the diagonal block U(s, s) + auto Xj = Kokkos::subview(X, range_type(j1, j2)); + + // workspaces + int workoffset = work_offset(s); + + /* TRSM with diagonal block */ + if (diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + if (invert_offdiagonal) { + // extract diagonal + off-diagonal blocks of U + auto Y = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + nsrow)); // needed with gemv for update&scatter + auto Uij = + Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, + Xj, + zero, + Y); + team.team_barrier(); + // copy the diagonal back to output + for (int ii = team_rank; ii < nscol; ii += team_size) { + Xj(ii) = Y(ii); + } + } else { + // extract diagonal block of U (stored on top) + auto Ujj = + Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + auto Y = Kokkos::subview( + work, + range_type(workoffset, + workoffset + + nscol)); // needed for gemv instead of trmv/trsv + for (int ii = team_rank; ii < nscol; ii += team_size) { + Y(ii) = Xj(ii); + } + team.team_barrier(); + KokkosBlas::TeamGemv< + member_type, KokkosBatched::Trans::NoTranspose, + KokkosBlas::Algo::Gemv::Unblocked>::invoke(team, one, Ujj, Y, + zero, Xj); } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); } } team.team_barrier(); + } + if (nsrow2 > 0) { + /* GEMM to update off diagonal blocks, Z = Uij * Xj */ + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, workoffset + nsrow)); + if (!invert_offdiagonal && diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + auto Uij = + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, + Xj, + zero, + Z); + team.team_barrier(); + } - /* GEMM to update with off diagonal blocks */ - auto Lij = - Kokkos::subview(viewL, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Lij, Xj, - zero, - Z); + /* scatter vector into Z */ + int i2 = i1 + nscol; // offset into rowind + Kokkos::View> + Xatomic(X.data(), X.extent(0)); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); + } team.team_barrier(); } } + }; +#endif - /* scatter vectors back into X */ - int i2 = i1 + nscol; // offset into rowind - int nsrow2 = - nsrow - - nscol; // "total" number of rows in all the off-diagonal supernodes - Kokkos::View> - Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); - } - team.team_barrier(); - } -}; - -// ----------------------------------------------------------- -// Functor for Upper-triangular solve in CSR -template -struct UpperTriSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; - - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - // NOTE: we currently supports only default_layout = LayoutLeft - using SupernodeView = - typename Kokkos::View; - - using range_type = Kokkos::pair; - - bool invert_diagonal; - const int *supercols; - ColptrType colptr; - RowindType rowind; - ValuesType values; - - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; - - LHSType X; - - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; - - NGBLType nodes_grouped_by_level; - - long node_count; - - // constructor - UpperTriSupernodalFunctor( // supernode info - bool invert_diagonal_, const int *supercols_, - // U in CSR - const ColptrType &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - int level_, integer_view_t &kernel_type_, - integer_view_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - LHSType &X_, - // workspace - work_view_t &work_, integer_view_t &work_offset_, - // - const NGBLType &nodes_grouped_by_level_, long node_count_) - : invert_diagonal(invert_diagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); - - // number of columns in the s-th supernode column - int j1 = supercols[s]; - int j2 = supercols[s + 1]; - int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - int i1 = colptr(j1); - int nsrow = colptr(j1 + 1) - i1; - - // create a view of the s-th supernocal row of U - scalar_t *dataU = const_cast(values.data()); - SupernodeView viewU(&dataU[i1], nsrow, nscol); - - // extract part of solution, corresponding to the diagonal block U(s, s) - auto Xj = Kokkos::subview(X, range_type(j1, j2)); - using Xj_type = decltype(Xj); - - // workspaces - int workoffset = work_offset(s); - - // "total" number of rows in all the off-diagonal supernodes - int nsrow2 = nsrow - nscol; - /* gather vector into Z */ - int i2 = i1 + nscol; // offset into rowind - auto Z = Kokkos::subview( - work, - range_type(workoffset + nscol, - workoffset + nsrow)); // needed with gemv for update&scatter - using Z_type = decltype(Z); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Z(ii) = X(i); - } - team.team_barrier(); - /* GEMM to update with off diagonal blocks, Xj = -Uij^T * Z */ - if (diag_kernel_type(level) != 3) { - // not device-level GEMV-udpate - auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - using Uij_type = decltype(Uij); - KokkosBlas::TeamGemv:: - template invoke( - team, -one, Uij, Z, one, Xj); - team.team_barrier(); - - /* TRSM with diagonal block */ - // extract diagonal and off-diagonal blocks of U - auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - using Ujj_type = decltype(Ujj); - - if (invert_diagonal) { - // workspace - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv - using Y_type = decltype(Y); - for (int ii = team_rank; ii < nscol; ii += team_size) { - Y(ii) = Xj(ii); + template + struct UpperTriLvlSchedRPSolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + // Assuming indices are sorted per row, diag entry is final index in the + // list + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + lhs(rowid) = rhs_rowid / val; } - team.team_barrier(); - - // caling team-level kernel in KokkosBatched on a small-size diagonal - KokkosBlas::TeamGemv:: - template invoke( - team, one, Ujj, Y, zero, Xj); - } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::Transpose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); - } - team.team_barrier(); + } // end for ptr } - } -}; - -// ----------------------------------------------------------- -// Functor for Upper-triangular solve in CSC -template -struct UpperTriTranSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; - - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - using range_type = Kokkos::pair; - - const bool invert_diagonal; - const bool invert_offdiagonal; - const int *supercols; - ColptrType colptr; - RowindType rowind; - ValuesType values; - - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; - - LHSType X; - - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; - - NGBLType nodes_grouped_by_level; - - long node_count; - - // constructor - UpperTriTranSupernodalFunctor( // supernode info - const bool invert_diagonal_, const bool invert_offdiagonal_, - const int *supercols_, - - // U in CSC - const ColptrType &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - const int level_, const integer_view_t &kernel_type_, - const integer_view_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - const LHSType &X_, - // workspace - const work_view_t &work_, const integer_view_t &work_offset_, - // - const NGBLType &nodes_grouped_by_level_, const long node_count_) - : invert_diagonal(invert_diagonal_), - invert_offdiagonal(invert_offdiagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); - - // number of columns in the s-th supernode column - const int j1 = supercols[s]; - const int j2 = supercols[s + 1]; - const int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - const int i1 = colptr(j1); - const int nsrow = colptr(j1 + 1) - i1; - // "total" number of rows in all the off-diagonal supernodes - const int nsrow2 = nsrow - nscol; - - // create a view of the s-th supernocal column of U - // NOTE: we currently supports only default_layout = LayoutLeft - scalar_t *dataU = const_cast(values.data()); - Kokkos::View - viewU(&dataU[i1], nsrow, nscol); - - // extract part of solution, corresponding to the diagonal block U(s, s) - auto Xj = Kokkos::subview(X, range_type(j1, j2)); - - // workspaces - int workoffset = work_offset(s); - - /* TRSM with diagonal block */ - if (diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - if (invert_offdiagonal) { - // extract diagonal + off-diagonal blocks of U - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nsrow)); // needed with gemv for update&scatter - auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, Xj, - zero, - Y); - team.team_barrier(); - // copy the diagonal back to output - for (int ii = team_rank; ii < nscol; ii += team_size) { - Xj(ii) = Y(ii); - } - } else { - // extract diagonal block of U (stored on top) - auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv - for (int ii = team_rank; ii < nscol; ii += team_size) { - Y(ii) = Xj(ii); - } - team.team_barrier(); - KokkosBlas::TeamGemv::invoke(team, - one, - Ujj, - Y, - zero, - Xj); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + auto diag = -1; + for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, - Xjj); + diag = ptr; } - } - team.team_barrier(); + } // end for ptr + lhs(rowid) = rhs_rowid / values(diag); } - if (nsrow2 > 0) { - /* GEMM to update off diagonal blocks, Z = Uij * Xj */ - auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); - if (!invert_offdiagonal && diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, Xj, - zero, - Z); - team.team_barrier(); - } + }; + + template + struct UpperTriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); - /* scatter vector into Z */ - int i2 = i1 + nscol; // offset into rowind - Kokkos::View> - Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); - } team.team_barrier(); - } - } -}; -#endif - -template -struct UpperTriLvlSchedRPSolverFunctor { - typedef typename EntriesType::non_const_value_type lno_t; - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; + // At end, finalize rowid == colid + // only one thread should do this, also can use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at start offset + lhs(rowid) = (rhs_rowid + diff) / values(soffset); } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } -}; - -template -struct UpperTriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } -}; - -// FIXME CUDA: This algorithm not working with all integral type combos -// In any case, this serves as a skeleton for 3-level hierarchical parallelism -// for alg dev -template -struct UpperTriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + auto diag = -1; - team.team_barrier(); - } -}; - -// -------------------------------- -// Single-block functors -// -------------------------------- - -template -struct LowerTriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - LowerTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + team.team_barrier(); -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; + // At end, finalize rowid == colid + // only one thread should do this, also can use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + } + }; + + // FIXME CUDA: This algorithm not working with all integral type combos + // In any case, this serves as a skeleton for 3-level hierarchical parallelism + // for alg dev + template + struct UpperTriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at start offset + lhs(rowid) = (rhs_rowid + diff) / values(soffset); + } // end if + }); // end TeamThreadRange -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; + team.team_barrier(); + } - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + } + }; + + // -------------------------------- + // Single-block functors + // -------------------------------- + + template + struct LowerTriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + LowerTriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); eoffset = row_map(rowid + 1); rhs_val = rhs(rowid); @@ -1797,38 +1602,31 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri + // ASSUMPTION: sorted diagonal value located at eoffset - 1 lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -1851,7 +1649,8 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + auto ptr = soffset + loffset; + auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { @@ -1864,194 +1663,189 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct UpperTriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - UpperTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); } - }, - diff); + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif - // ASSUMPTION: sorted diagonal value located at soffset - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; } -#else - auto trange = eoffset - soffset; - auto diag = -1; + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; + diff -= val * lhs(colid); } - }, - diff); + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct UpperTriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + UpperTriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2081,38 +1875,32 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri + // ASSUMPTION: sorted diagonal value located at soffset lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl each thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2134,6 +1922,7 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { #else auto trange = eoffset - soffset; auto diag = -1; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { @@ -2149,204 +1938,199 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct TriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl each thread + mut_node_count += nodes_this_lvl; } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); } - }, - diff); + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); } else { diag = ptr; } - }, - diff); + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct TriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + TriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, + const int dense_nrows_ = 0, const int cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + is_lowertri(is_lower_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2384,34 +2168,29 @@ struct TriLvlSchedTP1SingleBlockFunctor { else lhs(rowid) = (rhs_val + diff) / values(soffset); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2449,143 +2228,205 @@ struct TriLvlSchedTP1SingleBlockFunctor { #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct TriLvlSchedTP1SingleBlockFunctorDiagValues { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - ValuesType diagonal_values; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, const NGBLType &nodes_per_level_, - const ValuesType &diagonal_values_, long node_count_, - const long lvl_start_, const long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - diagonal_values(diagonal_values_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); +#endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri + if (is_lowertri) + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + else + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); + } else { + diag = ptr; } - }, - diff); + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct TriLvlSchedTP1SingleBlockFunctorDiagValues { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + ValuesType diagonal_values; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + TriLvlSchedTP1SingleBlockFunctorDiagValues( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const entries_t &nodes_per_level_, const ValuesType &diagonal_values_, + long node_count_, const long lvl_start_, const long lvl_end_, + const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + diagonal_values(diagonal_values_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + is_lowertri(is_lower_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2609,366 +2450,301 @@ struct TriLvlSchedTP1SingleBlockFunctorDiagValues { auto ptr = soffset + loffset; auto colid = entries(ptr); auto val = values(ptr); + if (colid != rowid) { tdiff -= val * lhs(colid); } }, diff); #endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower + // tri, soffset for upper tri lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT -template -struct ReturnTeamPolicyType; - -#ifdef KOKKOS_ENABLE_SERIAL -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_OPENMP -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif + lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; -template -struct ReturnRangePolicyType; - -#ifdef KOKKOS_ENABLE_SERIAL -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + template + static void lower_tri_solve_cg(TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { + typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = + thandle.get_sptrsvCudaGraph(); + + auto nlevels = thandle.get_num_levels(); + + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; + + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), + EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); + // Kokkos::fence(); + + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + size_type node_count = 0; + + int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 64 : team_size; + + // Start capturing stream + if (thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + auto policy = std::is_same::value + ? team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_OPENMP -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; + Kokkos::parallel_for( + "parfor_l_team_cudagraph", + Kokkos::Experimental::require( + policy, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + LowerTriLvlSchedTP1SolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count)); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + node_count += hnodes_per_level(iter); + } + } + cudaStreamEndCapture(stream1, &graph); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, + NULL, 0); + thandle.cudagraphCreated = true; + } + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + cudaStreamSynchronize(stream1); + Kokkos::fence(); + } // end lower_tri_solve_cg - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_HIP -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; + template + static void upper_tri_solve_cg(TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { + typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = + thandle.get_sptrsvCudaGraph(); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + auto nlevels = thandle.get_num_levels(); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; -#endif + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; -template -void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - typedef typename TriSolveHandle::execution_space execution_space; - typedef typename TriSolveHandle::size_type size_type; - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), + EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); - auto nlevels = thandle.get_num_levels(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; + size_type node_count = 0; - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - // Kokkos::fence(); + int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 64 : team_size; - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + // Start capturing stream + if (thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); - size_type node_count = 0; + auto policy = std::is_same::value + ? team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; + Kokkos::parallel_for( + "parfor_u_team_cudagraph", + Kokkos::Experimental::require( + policy, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + UpperTriLvlSchedTP1SolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count)); - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); - - using policy_type = ReturnTeamPolicyType; - - Kokkos::parallel_for( - "parfor_l_team_cudagraph", - Kokkos::Experimental::require( - ReturnTeamPolicyType::get_policy( - lvl_nodes, team_size, cuda1), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); - - node_count += hnodes_per_level(iter); + node_count += hnodes_per_level(iter); + } } - } - cudaStreamEndCapture(stream1, &graph); + cudaStreamEndCapture(stream1, &graph); - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, - 0); - thandle.cudagraphCreated = true; - } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - - cudaStreamSynchronize(stream1); - Kokkos::fence(); -} // end lower_tri_solve_cg - -template -void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - typedef typename TriSolveHandle::execution_space execution_space; - typedef typename TriSolveHandle::size_type size_type; - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); - - auto nlevels = thandle.get_num_levels(); - - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; - - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - - size_type node_count = 0; - - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; - - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); - - using policy_type = ReturnTeamPolicyType; - - Kokkos::parallel_for( - "parfor_u_team_cudagraph", - Kokkos::Experimental::require( - ReturnTeamPolicyType::get_policy( - lvl_nodes, team_size, cuda1), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); - - node_count += hnodes_per_level(iter); - } + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, + NULL, 0); + thandle.cudagraphCreated = true; } - cudaStreamEndCapture(stream1, &graph); - - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, - 0); - thandle.cudagraphCreated = true; - } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - cudaStreamSynchronize(stream1); - Kokkos::fence(); -} // end upper_tri_solve_cg + cudaStreamSynchronize(stream1); + Kokkos::fence(); + } // end upper_tri_solve_cg #endif -template -void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + template + static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - auto nlevels = thandle.get_num_levels(); - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + auto nlevels = thandle.get_num_levels(); + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - using device_t = Kokkos::Device; - using integer_view_t = typename TriSolveHandle::integer_view_t; - using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; - using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View; - - row_map_host_view_t row_map_host; - - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); - - if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - - row_map_host = row_map_host_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); - } + using namespace KokkosSparse::Experimental; + using device_t = Kokkos::Device; + using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; + using row_map_host_view_t = Kokkos::View; + + row_map_host_view_t row_map_host; + + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto nodes_grouped_by_level_host = + thandle.get_host_nodes_grouped_by_level(); + + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - // inversion options - const bool invert_diagonal = thandle.get_invert_diagonal(); - const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); - const bool unit_diagonal = thandle.is_unit_diagonal(); + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } + + // inversion options + const bool invert_diagonal = thandle.get_invert_diagonal(); + const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); + const bool unit_diagonal = thandle.is_unit_diagonal(); - // supernode sizes - const int *supercols = thandle.get_supercols(); - const int *supercols_host = thandle.get_supercols_host(); + // supernode sizes + const int *supercols = thandle.get_supercols(); + const int *supercols_host = thandle.get_supercols_host(); - // kernel types - integer_view_t kernel_type = thandle.get_kernel_type(); - integer_view_t diag_kernel_type = thandle.get_diag_kernel_type(); + // kernel types + work_view_int_t kernel_type = thandle.get_kernel_type(); + work_view_int_t diag_kernel_type = thandle.get_diag_kernel_type(); - integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); - integer_view_host_t diag_kernel_type_host = - thandle.get_diag_kernel_type_host(); + integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); + integer_view_host_t diag_kernel_type_host = + thandle.get_diag_kernel_type_host(); - // workspaces - integer_view_t work_offset = thandle.get_work_offset(); - integer_view_host_t work_offset_host = thandle.get_work_offset_host(); - auto work = thandle.get_workspace(); + // workspaces + work_view_int_t work_offset = thandle.get_work_offset(); + integer_view_host_t work_offset_host = thandle.get_work_offset_host(); + auto work = thandle.get_workspace(); #endif - size_type node_count = 0; + size_type node_count = 0; #ifdef profile_supernodal_etree - Kokkos::Timer sptrsv_timer; - sptrsv_timer.reset(); + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); #endif - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - { + for (size_type lvl = 0; lvl < nlevels; ++lvl) { size_type lvl_nodes = hnodes_per_level(lvl); if (lvl_nodes != 0) { @@ -2980,27 +2756,24 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( - Kokkos::RangePolicy(space, node_count, - node_count + lvl_nodes), + range_policy(space, node_count, node_count + lvl_nodes), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( + ValuesType, LHSType, RHSType>( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - using team_policy_t = Kokkos::TeamPolicy; - int team_size = thandle.get_team_size(); + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count); #else LowerTriLvlSchedTP1SolverFunctor + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); #endif @@ -3008,14 +2781,14 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); else Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, team_size), + team_policy(space, lvl_nodes, team_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } @@ -3051,10 +2824,10 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count, vector_size, 0); #else LowerTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); #endif Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size @@ -3072,7 +2845,6 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #endif // NOTE: we currently supports only default_layout = LayoutLeft - using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = Kokkos::View; @@ -3084,13 +2856,13 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_diagonal && !invert_offdiagonal) { // copy diagonals to workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -2, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } @@ -3175,21 +2947,21 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_offdiagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } } // launching sparse-triangular solve functor - LowerTriSupernodalFunctor + LowerTriSupernodalFunctor sptrsv_functor(unit_diagonal, invert_diagonal, invert_offdiagonal, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, @@ -3197,7 +2969,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_functor); @@ -3219,7 +2991,6 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3231,25 +3002,25 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, auto digmat = thandle.get_diagblock(lvl); KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } @@ -3259,13 +3030,13 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace - SparseTriSupernodalSpMVFunctor - sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor( + 0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, + work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_finalize_functor); @@ -3284,165 +3055,159 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, cudaProfilerStop(); #endif } // end if - } // scope for if-block - - } // end for lvl + } // end for lvl #ifdef profile_supernodal_etree - Kokkos::fence(); - double sptrsv_time_seconds = sptrsv_timer.seconds(); - std::cout << " + Execution space : " << execution_space::name() - << std::endl; - std::cout << " + Memory space : " << memory_space::name() << std::endl; - std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl - << std::endl; + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds(); + std::cout << " + Execution space : " << execution_space::name() + << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() + << std::endl; + std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl + << std::endl; #endif -} // end lower_tri_solve + } // end lower_tri_solve -template -void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + template + static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - using device_t = Kokkos::Device; - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - auto nlevels = thandle.get_num_levels(); - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); - // Kokkos::deep_copy(hnodes_per_level, nodes_per_level); - - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + using device_t = Kokkos::Device; + + auto nlevels = thandle.get_num_levels(); + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); + // Kokkos::deep_copy(hnodes_per_level, nodes_per_level); + + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using namespace KokkosSparse::Experimental; - using integer_view_t = typename TriSolveHandle::integer_view_t; - using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; - using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View; + using namespace KokkosSparse::Experimental; + using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; + using row_map_host_view_t = Kokkos::View; - row_map_host_view_t row_map_host; + row_map_host_view_t row_map_host; - const scalar_t zero(0.0); - const scalar_t one(1.0); + const scalar_t zero(0.0); + const scalar_t one(1.0); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_grouped_by_level_host = + thandle.get_host_nodes_grouped_by_level(); - if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - row_map_host = row_map_host_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); - } + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } - // supernode sizes - const int *supercols = thandle.get_supercols(); - const int *supercols_host = thandle.get_supercols_host(); + // supernode sizes + const int *supercols = thandle.get_supercols(); + const int *supercols_host = thandle.get_supercols_host(); - // inversion option - const bool invert_diagonal = thandle.get_invert_diagonal(); - const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); + // inversion option + const bool invert_diagonal = thandle.get_invert_diagonal(); + const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); - // kernel types - integer_view_t kernel_type = thandle.get_kernel_type(); - integer_view_t diag_kernel_type = thandle.get_diag_kernel_type(); + // kernel types + work_view_int_t kernel_type = thandle.get_kernel_type(); + work_view_int_t diag_kernel_type = thandle.get_diag_kernel_type(); - integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); - integer_view_host_t diag_kernel_type_host = - thandle.get_diag_kernel_type_host(); + integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); + integer_view_host_t diag_kernel_type_host = + thandle.get_diag_kernel_type_host(); - // workspace - integer_view_t work_offset = thandle.get_work_offset(); - integer_view_host_t work_offset_host = thandle.get_work_offset_host(); - auto work = thandle.get_workspace(); + // workspace + work_view_int_t work_offset = thandle.get_work_offset(); + integer_view_host_t work_offset_host = thandle.get_work_offset_host(); + auto work = thandle.get_workspace(); #endif - size_type node_count = 0; + size_type node_count = 0; -// This must stay serial; would be nice to try out Cuda's graph stuff to reduce -// kernel launch overhead + // This must stay serial; would be nice to try out Cuda's graph stuff to + // reduce kernel launch overhead #ifdef profile_supernodal_etree - Kokkos::Timer sptrsv_timer; - sptrsv_timer.reset(); + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); #endif - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - size_type lvl_nodes = hnodes_per_level(lvl); + for (size_type lvl = 0; lvl < nlevels; ++lvl) { + size_type lvl_nodes = hnodes_per_level(lvl); - if (lvl_nodes != 0) { + if (lvl_nodes != 0) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStart(); + cudaProfilerStart(); #endif - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, node_count, - node_count + lvl_nodes), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedRPSolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - using team_policy_t = Kokkos::TeamPolicy; - - int team_size = thandle.get_team_size(); + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::Experimental::require( + range_policy(space, node_count, node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + UpperTriLvlSchedRPSolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + } else if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - false, node_count); + TriLvlSchedTP1SolverFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + false, node_count); #else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + UpperTriLvlSchedTP1SolverFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); #endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - // TP2 algorithm has issues with some offset-ordinal combo to be addressed - /* - else if ( thandle.get_algorithm() == -KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef -Kokkos::TeamPolicy tvt_policy_type; - - int team_size = thandle.get_team_size(); - if ( team_size == -1 ) { + if (team_size == -1) + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + else + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } + // TP2 algorithm has issues with some offset-ordinal combo to be + // addressed + /* + else if ( thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { +typedef Kokkos::TeamPolicy tvt_policy_type; + + int team_size = thandle.get_team_size(); + if ( team_size == -1 ) { team_size = std::is_same< typename -Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : -64; + Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace +>::value ? 1 : 64; } int vector_size = thandle.get_team_size(); if ( vector_size == -1 ) { @@ -3461,10 +3226,10 @@ node_group (thread has full ownership of a node) #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, +LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count, vector_size, 0); #else UpperTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, +LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); #endif Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( @@ -3472,105 +3237,208 @@ nodes_grouped_by_level, node_count, node_groups); #endif tstf); } // end elseif */ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { #ifdef profile_supernodal_etree - size_t flops = 0; - Kokkos::Timer timer; - timer.reset(); + size_t flops = 0; + Kokkos::Timer timer; + timer.reset(); #endif - using team_policy_type = Kokkos::TeamPolicy; - if (thandle.is_column_major()) { // U stored in CSC - if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) - scalar_t *dataU = const_cast(values.data()); + if (thandle.is_column_major()) { // U stored in CSC + if (diag_kernel_type_host(lvl) == 3) { + // using device-level kernels (functor is called to gather the + // input into workspace) + scalar_t *dataU = const_cast(values.data()); + + if (invert_diagonal && !invert_offdiagonal) { + // copy diagonals to workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -2, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); + } + for (size_type league_rank = 0; league_rank < lvl_nodes; + league_rank++) { + auto s = nodes_grouped_by_level_host(node_count + league_rank); + + // supernodal column size + int j1 = supercols_host[s]; + int j2 = supercols_host[s + 1]; + int nscol = + j2 - j1; // number of columns in the s-th supernode column + + int i1 = row_map_host(j1); + int i2 = row_map_host(j1 + 1); + int nsrow = i2 - i1; // "total" number of rows in all the + // supernodes (diagonal+off-diagonal) + int nsrow2 = nsrow - nscol; // "total" number of rows in all + // the off-diagonal supernodes +#ifdef profile_supernodal_etree + flops += 2 * (nscol * nsrow); +#endif - if (invert_diagonal && !invert_offdiagonal) { - // copy diagonals to workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } - for (size_type league_rank = 0; league_rank < lvl_nodes; - league_rank++) { - auto s = nodes_grouped_by_level_host(node_count + league_rank); + // workspace + int workoffset = work_offset_host(s); - // supernodal column size - int j1 = supercols_host[s]; - int j2 = supercols_host[s + 1]; - int nscol = - j2 - j1; // number of columns in the s-th supernode column + // create a view for the s-th supernocal block column + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + viewU(&dataU[i1], nsrow, nscol); - int i1 = row_map_host(j1); - int i2 = row_map_host(j1 + 1); - int nsrow = i2 - i1; // "total" number of rows in all the - // supernodes (diagonal+off-diagonal) - int nsrow2 = nsrow - nscol; // "total" number of rows in all the - // off-diagonal supernodes -#ifdef profile_supernodal_etree - flops += 2 * (nscol * nsrow); -#endif + if (invert_offdiagonal) { + auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), + Kokkos::ALL()); + auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); + auto Z = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + + nsrow)); // needed with gemv for update&scatter + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + } else { + // extract part of the solution, corresponding to the diagonal + // block + auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - // workspace - int workoffset = work_offset_host(s); + // "triangular-solve" to compute Xj + // extract the diagonal block of s-th supernocal column of U + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), + Kokkos::ALL()); + if (invert_diagonal) { + auto Y = Kokkos::subview( + work, range_type( + workoffset, + workoffset + nscol)); // needed for gemv + // instead of trmv/trsv + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); + } else { + // NOTE: we currently supports only default_layout = + // LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); + } + // update off-diagonal blocks + if (nsrow2 > 0) { + // extract the off-diagonal blocks of s-th supernodal column + // of U + auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), + Kokkos::ALL()); + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, + workoffset + nscol + + nsrow2)); // needed with gemv for + // update&scatter + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + } + } + } + if (invert_offdiagonal) { + // copy diagonals from workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); + } + } - // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - viewU(&dataU[i1], nsrow, nscol); + // launching sparse-triangular solve functor + UpperTriTranSupernodalFunctor + sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, + row_map, entries, values, lvl, kernel_type, + diag_kernel_type, lhs, work, work_offset, + nodes_grouped_by_level, node_count); - if (invert_offdiagonal) { - auto Uij = - Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); + Kokkos::parallel_for( + "parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); + } else { // U stored in CSR + // launching sparse-triangular solve functor + UpperTriSupernodalFunctor + sptrsv_functor(invert_diagonal, supercols, row_map, entries, + values, lvl, kernel_type, diag_kernel_type, lhs, + work, work_offset, nodes_grouped_by_level, + node_count); + + Kokkos::parallel_for( + "parfor_usolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); + + if (diag_kernel_type_host(lvl) == 3) { + // using device-level kernels (functor is called to gather the + // input into workspace) + scalar_t *dataU = const_cast(values.data()); + + for (size_type league_rank = 0; league_rank < lvl_nodes; + league_rank++) { + auto s = nodes_grouped_by_level_host(node_count + league_rank); + + // supernodal column size + int j1 = supercols_host[s]; + int j2 = supercols_host[s + 1]; + int nscol = + j2 - j1; // number of columns in the s-th supernode column + + // "total" number of rows in all the supernodes + // (diagonal+off-diagonal) + int i1 = row_map_host(j1); + int i2 = row_map_host(j1 + 1); + int nsrow = i2 - i1; + // "total" number of rows in all the off-diagonal supernodes + int nsrow2 = nsrow - nscol; + + // workspace + int workoffset = work_offset_host(s); + + // create a view for the s-th supernocal block column + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + viewU(&dataU[i1], nsrow, nscol); + + // extract part of the solution, corresponding to the diagonal + // block auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - auto Z = Kokkos::subview( + auto Y = Kokkos::subview( work, range_type( workoffset, workoffset + - nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); - } else { - // extract part of the solution, corresponding to the diagonal - // block - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); + nscol)); // needed for gemv instead of trmv/trsv - // "triangular-solve" to compute Xj - // extract the diagonal block of s-th supernocal column of U - auto Ujj = - Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + - nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); - } else { - // NOTE: we currently supports only default_layout = - // LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); - } - // update off-diagonal blocks + // update with off-diagonal blocks if (nsrow2 > 0) { // extract the off-diagonal blocks of s-th supernodal column - // of U + // of + // U auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); auto Z = Kokkos::subview( @@ -3579,720 +3447,607 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } - } - } - if (invert_offdiagonal) { - // copy diagonals from workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } - } - - // launching sparse-triangular solve functor - UpperTriTranSupernodalFunctor - sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, - row_map, entries, values, lvl, kernel_type, - diag_kernel_type, lhs, work, work_offset, - nodes_grouped_by_level, node_count); - - using team_policy_t = Kokkos::TeamPolicy; - Kokkos::parallel_for( - "parfor_usolve_tran_supernode", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_functor); - } else { // U stored in CSR - // launching sparse-triangular solve functor - UpperTriSupernodalFunctor - sptrsv_functor(invert_diagonal, supercols, row_map, entries, - values, lvl, kernel_type, diag_kernel_type, lhs, - work, work_offset, nodes_grouped_by_level, - node_count); - - using team_policy_t = Kokkos::TeamPolicy; - Kokkos::parallel_for( - "parfor_usolve_supernode", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_functor); - - if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) - scalar_t *dataU = const_cast(values.data()); - - for (size_type league_rank = 0; league_rank < lvl_nodes; - league_rank++) { - auto s = nodes_grouped_by_level_host(node_count + league_rank); - - // supernodal column size - int j1 = supercols_host[s]; - int j2 = supercols_host[s + 1]; - int nscol = - j2 - j1; // number of columns in the s-th supernode column - // "total" number of rows in all the supernodes - // (diagonal+off-diagonal) - int i1 = row_map_host(j1); - int i2 = row_map_host(j1 + 1); - int nsrow = i2 - i1; - // "total" number of rows in all the off-diagonal supernodes - int nsrow2 = nsrow - nscol; - - // workspace - int workoffset = work_offset_host(s); - - // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - viewU(&dataU[i1], nsrow, nscol); - - // extract part of the solution, corresponding to the diagonal - // block - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + - nscol)); // needed for gemv instead of trmv/trsv - - // update with off-diagonal blocks - if (nsrow2 > 0) { - // extract the off-diagonal blocks of s-th supernodal column of - // U - auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), - Kokkos::ALL()); - auto Z = Kokkos::subview( - work, - range_type( - workoffset + nscol, - workoffset + nscol + - nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); + // "triangular-solve" to compute Xj + // extract the diagonal block of s-th supernocal column of U + auto Ujj = + Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); + } else { + // NOTE: we currently supports only default_layout = + // LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); + } } - - // "triangular-solve" to compute Xj - // extract the diagonal block of s-th supernocal column of U - auto Ujj = - Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { - KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); - } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); + // copy diagonals from workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); } } - if (invert_diagonal) { - // copy diagonals from workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } } - } #ifdef profile_supernodal_etree - Kokkos::fence(); - double time_seconds = timer.seconds(); - std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds - << " flop count: " << flops - << " kernel-type: " << kernel_type_host(lvl) - << " # of supernodes: " << lvl_nodes << std::endl; + Kokkos::fence(); + double time_seconds = timer.seconds(); + std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds + << " flop count: " << flops + << " kernel-type: " << kernel_type_host(lvl) + << " # of supernodes: " << lvl_nodes << std::endl; #endif - } else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV || - thandle.get_algorithm() == - SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + } else if (thandle.get_algorithm() == + SPTRSVAlgorithm::SUPERNODAL_SPMV || + thandle.get_algorithm() == + SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { #ifdef profile_supernodal_etree - Kokkos::Timer timer; - timer.reset(); + Kokkos::Timer timer; + timer.reset(); #endif - // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + // initialize input & output vectors - // update with one, or two, spmv - bool transpose_spmv = - ((!thandle.transpose_spmv() && thandle.is_column_major()) || - (thandle.transpose_spmv() && !thandle.is_column_major())); - const char *tran = (transpose_spmv ? "T" : "N"); - if (!transpose_spmv) { // U stored in CSR - if (!invert_offdiagonal) { - // solve with diagonals - auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); - // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + // update with one, or two, spmv + bool transpose_spmv = + ((!thandle.transpose_spmv() && thandle.is_column_major()) || + (thandle.transpose_spmv() && !thandle.is_column_major())); + const char *tran = (transpose_spmv ? "T" : "N"); + if (!transpose_spmv) { // U stored in CSR + if (!invert_offdiagonal) { + // solve with diagonals + auto digmat = thandle.get_diagblock(lvl); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); + // copy from work to lhs corresponding to diagonal blocks + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); + } else { + // zero out lhs corresponding to diagonal blocks in lhs, and copy + // to work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); + } + // update with off-diagonals (potentiall combined with diagonal + // solves) + auto submat = thandle.get_submatrix(lvl); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } else { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } - // update with off-diagonals (potentiall combined with diagonal - // solves) - auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); - } else { - if (!invert_offdiagonal) { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + if (!invert_offdiagonal) { + // zero out lhs corresponding to diagonal blocks in lhs, and copy + // to work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); - // update with off-diagonals - auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); + // update with off-diagonals + auto submat = thandle.get_submatrix(lvl); + KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); - // solve with diagonals - auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); - } else { - std::cout << " ** invert_offdiag with U in CSR not supported **" - << std::endl; + // solve with diagonals + auto digmat = thandle.get_diagblock(lvl); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); + } else { + std::cout << " ** invert_offdiag with U in CSR not supported **" + << std::endl; + } } - } - // reinitialize workspace - SparseTriSupernodalSpMVFunctor - sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_finalize_functor); + // reinitialize workspace + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor( + 0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, + work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree - Kokkos::fence(); - double time_seconds = timer.seconds(); - std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds - << " kernel-type: " << kernel_type_host(lvl) - << " # of supernodes: " << lvl_nodes << std::endl; + Kokkos::fence(); + double time_seconds = timer.seconds(); + std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds + << " kernel-type: " << kernel_type_host(lvl) + << " # of supernodes: " << lvl_nodes << std::endl; #endif - } + } #endif - node_count += lvl_nodes; + node_count += lvl_nodes; #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - } // end if - } // end for lvl + } // end if + } // end for lvl #ifdef profile_supernodal_etree - Kokkos::fence(); - double sptrsv_time_seconds = sptrsv_timer.seconds(); - std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl - << std::endl; - std::cout << " + Execution space : " << ExecutionSpace::name() - << std::endl; - std::cout << " + Memory space : " << memory_space::name() << std::endl; + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds(); + std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl + << std::endl; + std::cout << " + Execution space : " << execution_space::name() + << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() + << std::endl; #endif -} // end upper_tri_solve + } // end upper_tri_solve -template -void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, LHSType &lhs, - const bool /*is_lowertri_*/) { + template + static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); -#endif - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - // Algorithm is checked before this function is called - auto h_chain_ptr = thandle.get_host_chain_ptr(); - size_type num_chain_entries = thandle.get_num_chain_entries(); - - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - - const bool is_lowertri = thandle.is_lower_tri(); - - size_type node_count = 0; - - // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; - using large_cutoff_policy_type = - Kokkos::TeamPolicy; - /* - using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = - LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = - UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = - LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = - UpperTriLvlSchedTP1SingleBlockFunctor; - */ - using SingleBlockFunctor = - TriLvlSchedTP1SingleBlockFunctor; - - int team_size = thandle.get_team_size(); - int vector_size = - thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; - - auto cutoff = thandle.get_chain_threshold(); - int team_size_singleblock = team_size; - - // Enumerate options - // ts -1,0 | cu 0 - select default ts == 1 - // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto) - // ts > 0 | cu 0 - set - // ts > 0 | cu > 0 - set - // Controls ts,cu > 0 - // co > ts - not all rows can be mapped to a thread - must call largercutoff - // impl co <= ts - okay, kernel must be careful not to access out-of-bounds; - // some threads idol - if (team_size_singleblock <= 0 && cutoff == 0) { - team_size_singleblock = 1; - // If cutoff == 0, no single-block calls will be made, team_size_singleblock - // is unimportant - } - - // This is only necessary for Lower,UpperTri functor versions; else, - // is_lowertri can be passed as arg to the generic Tri functor... - if (is_lowertri) { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + cudaProfilerStop(); #endif - if (team_size == -1) { - team_size = - policy_type(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } - - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for( - "parfor_l_team_chain1", - Kokkos::Experimental::require( - policy_type(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - - } else { - size_type lvl_nodes = 0; - - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } + // Algorithm is checked before this function is called + auto h_chain_ptr = thandle.get_host_chain_ptr(); + size_type num_chain_entries = thandle.get_num_chain_entries(); + + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + const bool is_lowertri = thandle.is_lower_tri(); + + size_type node_count = 0; + + // REFACTORED to cleanup; next, need debug and timer routines + using large_cutoff_policy_type = + Kokkos::TeamPolicy; + /* + using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = + LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = + UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = + LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = + UpperTriLvlSchedTP1SingleBlockFunctor; + */ + using SingleBlockFunctor = + TriLvlSchedTP1SingleBlockFunctor; + + int team_size = thandle.get_team_size(); + int vector_size = + thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; + + auto cutoff = thandle.get_chain_threshold(); + int team_size_singleblock = team_size; + + // Enumerate options + // ts -1,0 | cu 0 - select default ts == 1 + // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto) + // ts > 0 | cu 0 - set + // ts > 0 | cu > 0 - set + // Controls ts,cu > 0 + // co > ts - not all rows can be mapped to a thread - must call + // largercutoff impl co <= ts - okay, kernel must be careful not to access + // out-of-bounds; some threads idol + if (team_size_singleblock <= 0 && cutoff == 0) { + team_size_singleblock = 1; + // If cutoff == 0, no single-block calls will be made, + // team_size_singleblock is unimportant + } - if (team_size_singleblock <= 0) { - team_size_singleblock = - policy_type(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } + // This is only necessary for Lower,UpperTri functor versions; else, + // is_lowertri can be passed as arg to the generic Tri functor... + if (is_lowertri) { + for (size_type chainlink = 0; chainlink < num_chain_entries; + ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); - if (cutoff <= team_size_singleblock) { + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true); + true, node_count); #else - LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + LowerTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); + node_count); #endif + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } + + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? Kokkos::parallel_for( - "parfor_l_team_chainmulti", + "parfor_l_team_chain1", Kokkos::Experimental::require( - policy_type(space, 1, team_size_singleblock, vector_size), + team_policy(space, lvl_nodes, team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); + node_count += lvl_nodes; + } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally + size_type lvl_nodes = 0; + + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } + + if (team_size_singleblock <= 0) { + team_size_singleblock = + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain, is_lowertri), + Kokkos::ParallelForTag()); + } + + if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true, 0, - cutoff); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, true); #else - LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); + LowerTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain); #endif - Kokkos::parallel_for( - "parfor_l_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; - } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? - } - - } else { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos + Kokkos::parallel_for( + "parfor_l_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } else { + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - is_lowertri, node_count); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, true, 0, + cutoff); #else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + LowerTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, cutoff); #endif - if (team_size == -1) { - team_size = - policy_type(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } - - // TODO To use cudagraph here, need to know how many non-unit chains - // there are, create a graph for each and launch accordingly - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for( - "parfor_u_team_chain1", - Kokkos::Experimental::require( - policy_type(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - - } else { - size_type lvl_nodes = 0; - - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); + Kokkos::parallel_for( + "parfor_l_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } + node_count += lvl_nodes; } + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? + } - if (team_size_singleblock <= 0) { - // team_size_singleblock = policy_type(1, 1, - // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, - // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), - // Kokkos::ParallelForTag()); - team_size_singleblock = - policy_type(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } + } else { + for (size_type chainlink = 0; chainlink < num_chain_entries; + ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); - if (cutoff <= team_size_singleblock) { + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri); + is_lowertri, node_count); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + UpperTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); + node_count); #endif + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } + + // TODO To use cudagraph here, need to know how many non-unit chains + // there are, create a graph for each and launch accordingly + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? Kokkos::parallel_for( - "parfor_u_team_chainmulti", + "parfor_u_team_chain1", Kokkos::Experimental::require( - policy_type(space, 1, team_size_singleblock, vector_size), + team_policy(space, lvl_nodes, team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); + node_count += lvl_nodes; + } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally + size_type lvl_nodes = 0; + + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } + + if (team_size_singleblock <= 0) { + // team_size_singleblock = team_policy(1, 1, + // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, + // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, + // node_count), Kokkos::ParallelForTag()); + team_size_singleblock = + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain, is_lowertri), + Kokkos::ParallelForTag()); + } + + if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri, 0, - cutoff); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, is_lowertri); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); + UpperTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain); #endif - Kokkos::parallel_for( - "parfor_u_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + Kokkos::parallel_for( + "parfor_u_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } else { + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, is_lowertri, + 0, cutoff); +#else + UpperTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, cutoff); +#endif + Kokkos::parallel_for( + "parfor_u_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } + node_count += lvl_nodes; } - node_count += lvl_nodes; + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? } - } - -} // end tri_solve_chain - -// -------------------------------- -// Stream interfaces -// -------------------------------- - -template -void lower_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using size_type = typename TriSolveHandle::size_type; - using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. Launch work on all streams + } // end tri_solve_chain + + // -------------------------------- + // Stream interfaces + // -------------------------------- + template + static void lower_tri_solve_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::RangePolicy( - execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), - LowerTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], - nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + range_policy(execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + LowerTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], true, - node_count_v[i]); + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], true, + node_count_v[i]); #else - LowerTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + LowerTriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, team_size), tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl -} // end lower_tri_solve_streams - -template -void upper_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using size_type = typename TriSolveHandle::size_type; - using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. Launch work on all streams + if (team_size == -1) + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl + } // end lower_tri_solve_streams + + template + static void upper_tri_solve_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::RangePolicy( - execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), - UpperTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], - nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + range_policy(execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + UpperTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], false, - node_count_v[i]); + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], false, + node_count_v[i]); #else - UpperTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + UpperTriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, team_size), tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl -} // end upper_tri_solve_streams + if (team_size == -1) + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl + } // end upper_tri_solve_streams + +}; // struct SptrsvWrap } // namespace Experimental } // namespace Impl diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 6ad321c286..d69c499c60 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -120,6 +120,9 @@ struct SPTRSV_SOLVE; + // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -132,19 +135,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, true); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif - Experimental::lower_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::lower_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { @@ -153,19 +156,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, false); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif - Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::upper_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); } } Kokkos::Profiling::popRegion(); @@ -178,6 +181,8 @@ struct SPTRSV_SOLVE &entries_v, const std::vector &values_v, const std::vector &b_v, std::vector &x_v) { + using Sptrsv = + Experimental::SptrsvWrap; // Call specific algorithm type // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment // Assume streams have the same either lower or upper matrix type @@ -197,9 +202,8 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -208,9 +212,8 @@ struct SPTRSV_SOLVEsptrsvHandle; } void create_sptrsv_handle(KokkosSparse::Experimental::SPTRSVAlgorithm algm, - size_type nrows, bool lower_tri) { + size_type nrows, bool lower_tri, + size_type block_size = 0) { this->destroy_sptrsv_handle(); this->is_owner_of_the_sptrsv_handle = true; - this->sptrsvHandle = new SPTRSVHandleType(algm, nrows, lower_tri); + this->sptrsvHandle = + new SPTRSVHandleType(algm, nrows, lower_tri, block_size); // this->sptrsvHandle->init_handle(nrows); this->sptrsvHandle->set_team_size(this->team_work_size); this->sptrsvHandle->set_vector_size(this->vector_size); diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index cf23bfdc1f..fb322b7f95 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -56,76 +56,79 @@ template class SPTRSVHandle { public: - typedef ExecutionSpace HandleExecSpace; - typedef TemporaryMemorySpace HandleTempMemorySpace; - typedef PersistentMemorySpace HandlePersistentMemorySpace; - - typedef ExecutionSpace execution_space; - typedef HandlePersistentMemorySpace memory_space; - - typedef typename std::remove_const::type size_type; - typedef const size_type const_size_type; - - typedef typename std::remove_const::type nnz_lno_t; - typedef const nnz_lno_t const_nnz_lno_t; - - typedef typename std::remove_const::type scalar_t; - typedef const scalar_t const_nnz_scalar_t; - - // row_map type (managed memory) - typedef typename Kokkos::View - nnz_row_view_temp_t; - typedef typename Kokkos::View - nnz_row_view_t; - typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t; - typedef typename Kokkos::View - int_row_view_t; - typedef typename Kokkos::View - int64_row_view_t; + using HandleExecSpace = ExecutionSpace; + using HandleTempMemorySpace = TemporaryMemorySpace; + using HandlePersistentMemorySpace = PersistentMemorySpace; + + using execution_space = ExecutionSpace; + using memory_space = HandlePersistentMemorySpace; + + using TeamPolicy = Kokkos::TeamPolicy; + using RangePolicy = Kokkos::RangePolicy; + + using size_type = typename std::remove_const::type; + using const_size_type = const size_type; + + using nnz_lno_t = typename std::remove_const::type; + using const_nnz_lno_t = const nnz_lno_t; + + using scalar_t = typename std::remove_const::type; + using const_nnz_scalar_t = const scalar_t; + + // Row_map type (managed memory) + using nnz_row_view_temp_t = + typename Kokkos::View; + using nnz_row_view_t = + typename Kokkos::View; + using host_nnz_row_view_t = typename nnz_row_view_t::HostMirror; + using int_row_view_t = + typename Kokkos::View; + using int64_row_view_t = + typename Kokkos::View; // typedef typename row_lno_persistent_work_view_t::HostMirror // row_lno_persistent_work_host_view_t; //Host view type - typedef typename Kokkos::View< + using nnz_row_unmanaged_view_t = typename Kokkos::View< const size_type *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_row_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // values type (managed memory) - typedef typename Kokkos::View - nnz_scalar_view_temp_t; - typedef typename Kokkos::View - nnz_scalar_view_t; - typedef typename nnz_scalar_view_t::HostMirror host_nnz_scalar_view_t; - typedef typename Kokkos::View< + using nnz_scalar_view_temp_t = + typename Kokkos::View; + using nnz_scalar_view_t = + typename Kokkos::View; + using host_nnz_scalar_view_t = typename nnz_scalar_view_t::HostMirror; + using nnz_scalar_unmanaged_view_t = typename Kokkos::View< const scalar_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_scalar_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // entries type (managed memory) - typedef typename Kokkos::View - nnz_lno_view_temp_t; - typedef typename Kokkos::View - nnz_lno_view_t; - typedef typename Kokkos::View - hostspace_nnz_lno_view_t; - typedef typename nnz_lno_view_t::HostMirror host_nnz_lno_view_t; - typedef typename Kokkos::View< + using nnz_lno_view_temp_t = + typename Kokkos::View; + using nnz_lno_view_t = + typename Kokkos::View; + using hostspace_nnz_lno_view_t = + typename Kokkos::View; + using host_nnz_lno_view_t = typename nnz_lno_view_t::HostMirror; + using nnz_lno_unmanaged_view_t = typename Kokkos::View< const nnz_lno_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_lno_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // typedef typename nnz_lno_persistent_work_view_t::HostMirror // nnz_lno_persistent_work_host_view_t; //Host view type - typedef typename std::make_signed< - typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; - typedef Kokkos::View - signed_nnz_lno_view_t; - typedef typename signed_nnz_lno_view_t::HostMirror host_signed_nnz_lno_view_t; + using signed_integral_t = typename std::make_signed< + typename nnz_row_view_t::non_const_value_type>::type; + using signed_nnz_lno_view_t = + Kokkos::View; - typedef typename Kokkos::View - mtx_scalar_view_t; + using host_signed_nnz_lno_view_t = typename signed_nnz_lno_view_t::HostMirror; + + using mtx_scalar_view_t = + typename Kokkos::View; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11030) @@ -214,7 +217,7 @@ class SPTRSVHandle { }; #endif - typedef cuSparseHandleType SPTRSVcuSparseHandleType; + using SPTRSVcuSparseHandleType = cuSparseHandleType; #endif #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT @@ -228,7 +231,7 @@ class SPTRSVHandle { //~cudaGraphWrapperType() { } }; - typedef cudaGraphWrapperType SPTRSVcudaGraphWrapperType; + using SPTRSVcudaGraphWrapperType = cudaGraphWrapperType; void create_SPTRSVcudaGraphWrapperType() { destroy_SPTRSVcudaGraphWrapperType(); @@ -296,6 +299,7 @@ class SPTRSVHandle { nnz_lno_view_t nodes_grouped_by_level; hostspace_nnz_lno_view_t hnodes_grouped_by_level; // NEW size_type nlevel; + size_type block_size; // block_size > 0 implies BSR int team_size; int vector_size; @@ -423,7 +427,8 @@ class SPTRSVHandle { public: SPTRSVHandle(SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, - bool symbolic_complete_ = false, bool numeric_complete_ = false) + const size_type block_size_ = 0, bool symbolic_complete_ = false, + bool numeric_complete_ = false) : #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT cudagraphCreated(false), @@ -438,6 +443,7 @@ class SPTRSVHandle { nodes_grouped_by_level(), hnodes_grouped_by_level(), nlevel(0), + block_size(block_size_), team_size(-1), vector_size(-1), stored_diagonal(false), @@ -1007,6 +1013,14 @@ class SPTRSVHandle { void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + KOKKOS_INLINE_FUNCTION + void set_block_size(const size_type block_size_) { + this->block_size = block_size_; + } + void set_symbolic_complete() { this->symbolic_complete = true; } void set_symbolic_incomplete() { this->symbolic_complete = false; } From cb7a552b75ddf7648782b127e82e8210bd4bca48 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 10 Jul 2024 20:21:19 -0600 Subject: [PATCH 296/326] Sparse - SpMV: removing calls to unsuported oneapi - MKL functions (#2274) --- sparse/src/KokkosSparse_spmv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 5fa0be3619..de98701b7c 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -281,7 +281,7 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], #ifdef KOKKOS_ENABLE_SYCL if constexpr (std::is_same_v) { - useNative = useNative || (mode[0] == Conjugate[0]); + useNative = useNative || (mode[0] != NoTranspose[0]); } #endif #endif From aaa634b6c4a8b97d24fea68b863cbe9dd615764b Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 11 Jul 2024 18:52:28 -0600 Subject: [PATCH 297/326] Sycl gemv beta (#2276) * BLAS - GEMV: zero out Y when beta == 0 in SYCL TPL code path * BLAS - GEMV: reverting wrong change from previous PR, my bad. * Applying clang-format --- blas/src/KokkosBlas2_gemv.hpp | 5 ++--- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index e68f2cca75..88ffc63810 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -165,9 +165,8 @@ void gemv(const ExecutionSpace& space, const char trans[], // oneMKL supports both row-major and column-major of A // but only supports oneapi::mkl::transpose::nontrans op useFallback = - useFallback || ((tolower(*trans) == 't' || tolower(*trans) == 'c') && - std::is_same_v); + useFallback || !std::is_same_v; #endif #endif diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 304dd349bf..07d9476b66 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -824,6 +824,10 @@ struct kokkos_to_std_type_map { const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ const YViewType& Y) { \ + if (beta == Kokkos::ArithTraits::zero()) { \ + Kokkos::deep_copy(Y, Kokkos::ArithTraits::zero()); \ + } \ + \ bool row_major = std::is_same::value; \ const std::int64_t M = A.extent(0); \ const std::int64_t N = A.extent(1); \ From e7a4b0723857d56d3d15301456606760c3f17252 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 12 Jul 2024 10:38:47 -0400 Subject: [PATCH 298/326] Unify alignPtrTo implementation (#2275) --- common/src/KokkosKernels_Utils.hpp | 37 ++++++------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 89aeabb823..92419424b6 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1527,41 +1527,18 @@ struct array_sum_reduce { } }; -/* Several alternatives were considered for SYCL, including - -unsigned int f1(unsigned int i, unsigned int align) -{ - return ((i + align - 1) / align * align); -} - -unsigned int f2(unsigned int i, unsigned int align) -{ - return (i + align - 1) & (-align); -} - -f1 should be equivalent to the below, but it produces incorrect results on SYCL -f2 is how GCC does std::align, but it also produces incorrect results on SYCL -possibly alignof(T) is not a power-of-2 on SYCL? Or a compiler error. -*/ -#if defined(KOKKOS_ENABLE_SYCL) template -KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { - std::uintptr_t ptrVal = reinterpret_cast(p); - while (ptrVal % alignof(T)) { - ++ptrVal; - } - return reinterpret_cast(ptrVal); -} -#else -template -KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { +KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr *p) { // ugly but computationally free and the "right" way to do this in C++ - std::uintptr_t ptrVal = reinterpret_cast(p); + const std::uintptr_t ptrVal = reinterpret_cast(p); // ptrVal + (align - 1) lands inside the next valid aligned scalar_t, // and the mask produces the start of that scalar_t. - return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); + const std::uintptr_t ptrValNew = + (ptrVal + alignof(T) - 1) & (~(alignof(T) - 1)); + return reinterpret_cast( + reinterpret_cast(const_cast *>(p)) + + (ptrValNew - ptrVal)); } -#endif } // namespace Impl } // namespace KokkosKernels From 3ce7adb7280cf190f15b205b11aac06dcb2d6181 Mon Sep 17 00:00:00 2001 From: Baptiste Legouix Date: Fri, 12 Jul 2024 16:40:09 +0200 Subject: [PATCH 299/326] init (#2273) --- sparse/impl/KokkosSparse_spmv_team_spec.hpp | 4 ++-- sparse/src/KokkosSparse_spmv_team.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_team_spec.hpp b/sparse/impl/KokkosSparse_spmv_team_spec.hpp index 156123b113..f065a34fb6 100644 --- a/sparse/impl/KokkosSparse_spmv_team_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_team_spec.hpp @@ -37,7 +37,7 @@ struct TeamSpmv { return Impl::TeamSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( - member, x.extent(0), alpha, values.data(), values.stride_0(), + member, y.extent(0), alpha, values.data(), values.stride_0(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); @@ -56,7 +56,7 @@ struct TeamVectorSpmv { return Impl::TeamVectorSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( - member, x.extent(0), alpha, values.data(), values.stride_0(), + member, y.extent(0), alpha, values.data(), values.stride_0(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); diff --git a/sparse/src/KokkosSparse_spmv_team.hpp b/sparse/src/KokkosSparse_spmv_team.hpp index 6c68478501..c3f2bfa49f 100644 --- a/sparse/src/KokkosSparse_spmv_team.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -62,7 +62,7 @@ int KOKKOS_INLINE_FUNCTION team_spmv( return 1; } - if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + if ((x.extent(0) + 1) != row_ptr.extent(0)) { Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", @@ -116,7 +116,7 @@ int KOKKOS_INLINE_FUNCTION team_vector_spmv( return 1; } - if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + if ((x.extent(0) + 1) != row_ptr.extent(0)) { Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", From c93b6dc32f6eaa4fc8a13052fb638a30e6a34b49 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 16 Jul 2024 15:13:31 -0600 Subject: [PATCH 300/326] Bigger sptrsv cleanup (#2280) * Some cleanup and refactoring * Remove Upper/Lower TriLvlSchedTP2SolverFunctors * Remove Upper/Lower single block functors * Remove unused TriLvlSchedTP1SingleBlockFunctorDiagValues and merge upper/lower tri_solve_cg * Merge two big upper/lower branch of tri_solve_chain * Merge upper/lower tri_solve_streams * Switch over block spiluk precond test to use new block sptrsv --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 4 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 2646 +++-------------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 32 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 559 +--- 4 files changed, 585 insertions(+), 2656 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 019a63fcd7..0a4a75933e 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -305,7 +305,6 @@ void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11030) typedef typename KernelHandle::nnz_lno_t idx_type; - typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::scalar_t scalar_type; typedef typename KernelHandle::memory_space memory_space; @@ -474,7 +473,6 @@ void sptrsvcuSPARSE_solve_streams( ) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE using idx_type = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; using scalar_type = typename KernelHandle::nnz_scalar_t; using memory_space = typename KernelHandle::HandlePersistentMemorySpace; using sptrsvHandleType = typename KernelHandle::SPTRSVHandleType; @@ -544,6 +542,8 @@ void sptrsvcuSPARSE_solve_streams( } } #else // CUDA_VERSION < 11030 + using size_type = typename KernelHandle::size_type; + const bool is_cuda_space = std::is_same::value || std::is_same::value || diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index d385a390cd..bc31f14791 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -35,8 +35,6 @@ #include "KokkosBatched_Trsm_Team_Impl.hpp" #endif -//#define SERIAL_FOR_LOOP - #define KOKKOSKERNELS_SPTRSV_TRILVLSCHED //#define KOKKOSPSTRSV_SOLVE_IMPL_PROFILE 1 @@ -100,11 +98,12 @@ struct SptrsvWrap { void operator()(const int) const {} }; - // This functor unifies the lower and upper implementations, the hope is the - // "is_lowertri" check does not add noticable time on larger problems + /** + * Common base class for sptrsv functors + */ template - struct TriLvlSchedTP1SolverFunctor { + class LHSType, class RHSType, bool BlockEnabled> + struct Common { RowMapType row_map; EntriesType entries; ValuesType values; @@ -112,538 +111,280 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - - TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const bool &is_lowertri_, - const long &node_count_) + Common(const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_ = 0) : row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_) {} + nodes_grouped_by_level(nodes_grouped_by_level_) { + KK_REQUIRE_MSG(!BlockEnabled, "Blocks are not yet supported."); + KK_REQUIRE_MSG(block_size_ == 0, "Blocks are not yet supported."); + } - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + struct ReduceSumFunctor { + const Common *m_obj; + const lno_t rowid; + lno_t diag; + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, scalar_t &accum) const { + const auto colid = m_obj->entries(i); + auto val = m_obj->values(i); + auto lhs_colid = m_obj->lhs(colid); + accum -= val * lhs_colid; + KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); + } + }; - team.team_barrier(); + struct ReduceSumDiagFunctor { + const Common *m_obj; + const lno_t rowid; + lno_t diag; - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, scalar_t &accum) const { + const auto colid = m_obj->entries(i); + if (colid != rowid) { + auto val = m_obj->values(i); + auto lhs_colid = m_obj->lhs(colid); + accum -= val * lhs_colid; + } else { + diag = i; + } } - } + }; KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } + static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, + const scalar_t &diag_val) { + lhs_val = (lhs_val + rhs_val) / diag_val; } - }; - - template - struct TriLvlSchedTP1SolverFunctorDiagValues { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - ValuesType diagonal_values; // inserted according to rowid - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long dense_nrows; - - TriLvlSchedTP1SolverFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const ValuesType &diagonal_values_, const bool is_lowertri_, - long node_count_, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - diagonal_values(diagonal_values_), - is_lowertri(is_lowertri_), - node_count(node_count_), - dense_nrows(dense_nrows_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + template + KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, + const int my_rank, + const long node_count) const { + static_assert( + !((!IsSerial && BlockEnabled) && UseThreadVec), + "ThreadVectorRanges are not yet supported for block-enabled"); + static_assert(!(IsSerial && UseThreadVec), + "Requested thread vector range in serial?"); + + const auto rowid = nodes_grouped_by_level(my_rank + node_count); + const auto soffset = row_map(rowid); + const auto eoffset = row_map(rowid + 1); + const auto rhs_val = rhs(rowid); + scalar_t &lhs_val = lhs(rowid); + + // Set up range to auto-skip diag if is sorted + const auto itr_b = soffset + (IsSorted ? (IsLower ? 0 : 1) : 0); + const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0); + + // We don't need the reducer to find the diag item if sorted + using reducer_t = + std::conditional_t; + reducer_t rf{this, rowid, -1}; + + if constexpr (IsSerial) { + KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); + KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); + for (auto ptr = itr_b; ptr < itr_e; ++ptr) { + rf(ptr, lhs_val); + } + } else { + KK_KERNEL_ASSERT_MSG(team != nullptr, + "Cannot do team operations without team"); + if constexpr (!UseThreadVec) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), + rf, lhs_val); + team->team_barrier(); + } else { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); + } + } - team.team_barrier(); + // If sorted, we already know the diag. Otherwise, get it from the reducer + rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : - // (rhs_rowid+diff)/values(soffset); - lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); + // At end, handle the diag element. We need to be careful to avoid race + // conditions here. + if constexpr (IsSerial) { + // Serial case is easy, there's only 1 thread so just do the + // add_and_divide + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } else { + if constexpr (IsSorted) { + // Parallel sorted case is complex. All threads know what the diag is. + // If we have a team sharing the work, we need to ensure only one + // thread performs the add_and_divide. + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); + if constexpr (!UseThreadVec) { + Kokkos::single(Kokkos::PerTeam(*team), [&]() { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + }); + } else { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } + } else { + // Parallel unsorted case. Only one thread should know what the diag + // item is. We have that one do the add_and_divide. + if (rf.diag != -1) { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } + } } } }; template - struct TriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; + class LHSType, class RHSType, bool IsLower, bool BlockEnabled> + struct TriLvlSchedTP1SolverFunctor + : public Common { + using Base = Common; - const bool is_lowertri; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset - long node_groups; - long dense_nrows; - TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, - long node_groups_ = 0, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_), - node_groups(node_groups_), - dense_nrows(dense_nrows_) {} + const long &node_count_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, + block_size_), + node_count(node_count_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri - ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + Base::template solve_impl(&team, team.league_rank(), + node_count); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + Base::template solve_impl( + &team, team.league_rank(), node_count); } }; // Lower vs Upper Multi-block Functors template - struct LowerTriLvlSchedRPSolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} + class LHSType, class RHSType, bool IsLower, bool BlockEnabled> + struct TriLvlSchedRPSolverFunctor + : public Common { + using Base = Common; + + TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, + block_size_) {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr + Base::template solve_impl(nullptr, 0, i); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; - } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); + Base::template solve_impl(nullptr, 0, i); } }; template - struct LowerTriLvlSchedTP1SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; + class LHSType, class RHSType, bool IsLower> + struct TriLvlSchedTP1SingleBlockFunctor + : public Common { + using Base = + Common; + + entries_t nodes_per_level; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset - long node_groups; - - LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), + long lvl_start; + long lvl_end; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + TriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, + const int dense_nrows_ = 0, const int cutoff_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, + nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), node_count(node_count_), - node_groups(node_groups_) {} + lvl_start(lvl_start_), + lvl_end(lvl_end_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row - team.team_barrier(); + template + KOKKOS_INLINE_FUNCTION void common_impl(const member_type &team) const { + auto mut_node_count = node_count; - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + const auto nodes_this_lvl = nodes_per_level(lvl); + const auto my_team_rank = team.team_rank(); + const auto loop_cutoff = LargerCutoff ? cutoff : my_team_rank + 1; + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < loop_cutoff; + my_rank += team.team_size()) { + if (my_rank < nodes_this_lvl) { + Base::template solve_impl( + &team, my_rank, mut_node_count); + } + } + mut_node_count += nodes_this_lvl; + team.team_barrier(); } } KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } + void operator()(const member_type &team) const { + common_impl(team); } - }; - - // FIXME CUDA: This algorithm not working with all integral type combos - // In any case, this serves as a skeleton for 3-level hierarchical parallelism - // for alg dev - template - struct LowerTriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + void operator()(const UnsortedTag &, const member_type &team) const { + common_impl(team); } KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + void operator()(const LargerCutoffTag &, const member_type &team) const { + common_impl(team); + } - team.team_barrier(); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + common_impl(team); } }; @@ -1217,1396 +958,54 @@ struct SptrsvWrap { KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, - Xjj); - } - } - team.team_barrier(); - } - if (nsrow2 > 0) { - /* GEMM to update off diagonal blocks, Z = Uij * Xj */ - auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); - if (!invert_offdiagonal && diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, - Xj, - zero, - Z); - team.team_barrier(); - } - - /* scatter vector into Z */ - int i2 = i1 + nscol; // offset into rowind - Kokkos::View> - Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); - } - team.team_barrier(); - } - } - }; -#endif - - template - struct UpperTriLvlSchedRPSolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; - } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } - }; - - template - struct UpperTriLvlSchedTP1SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } - }; - - // FIXME CUDA: This algorithm not working with all integral type combos - // In any case, this serves as a skeleton for 3-level hierarchical parallelism - // for alg dev - template - struct UpperTriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - }; - - // -------------------------------- - // Single-block functors - // -------------------------------- - - template - struct LowerTriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - LowerTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - - template - struct UpperTriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - UpperTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at soffset - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - - template - struct TriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - - template - struct TriLvlSchedTP1SingleBlockFunctorDiagValues { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - ValuesType diagonal_values; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const entries_t &nodes_per_level_, const ValuesType &diagonal_values_, - long node_count_, const long lvl_start_, const long lvl_end_, - const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - diagonal_values(diagonal_values_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; + Xjj); + } } team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - -#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT - template - static void lower_tri_solve_cg(TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); - - auto nlevels = thandle.get_num_levels(); - - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; - - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - // Kokkos::fence(); - - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - - size_type node_count = 0; - - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; - - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); - - auto policy = std::is_same::value - ? team_policy(lvl_nodes, team_size, cuda1) - : team_policy(lvl_nodes, team_size); - - Kokkos::parallel_for( - "parfor_l_team_cudagraph", - Kokkos::Experimental::require( - policy, - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); + } + if (nsrow2 > 0) { + /* GEMM to update off diagonal blocks, Z = Uij * Xj */ + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, workoffset + nsrow)); + if (!invert_offdiagonal && diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + auto Uij = + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, + Xj, + zero, + Z); + team.team_barrier(); + } - node_count += hnodes_per_level(iter); + /* scatter vector into Z */ + int i2 = i1 + nscol; // offset into rowind + Kokkos::View> + Xatomic(X.data(), X.extent(0)); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); } + team.team_barrier(); } - cudaStreamEndCapture(stream1, &graph); - - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, - NULL, 0); - thandle.cudagraphCreated = true; } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); + }; +#endif - cudaStreamSynchronize(stream1); - Kokkos::fence(); - } // end lower_tri_solve_cg + // + // End of functors, begin external API + // - template - static void upper_tri_solve_cg(TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + static void tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, + const EntriesType entries, const ValuesType values, + const RHSType &rhs, LHSType &lhs) { typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = thandle.get_sptrsvCudaGraph(); @@ -2642,12 +1041,12 @@ struct SptrsvWrap { : team_policy(lvl_nodes, team_size); Kokkos::parallel_for( - "parfor_u_team_cudagraph", + "parfor_l_team_cudagraph", Kokkos::Experimental::require( policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedTP1SolverFunctor( + TriLvlSchedTP1SolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); @@ -2667,10 +1066,14 @@ struct SptrsvWrap { cudaStreamSynchronize(stream1); Kokkos::fence(); - } // end upper_tri_solve_cg + } // end tri_solve_cg #endif +#define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ + Functor + template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, @@ -2681,13 +1084,22 @@ struct SptrsvWrap { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - auto nlevels = thandle.get_num_levels(); + const auto nlevels = thandle.get_num_levels(); // Keep this a host View, create device version and copy to back to host // during scheduling This requires making sure the host view in the handle // is properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + const auto nodes_per_level = thandle.get_nodes_per_level(); + const auto hnodes_per_level = thandle.get_host_nodes_per_level(); + const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + const auto block_size = thandle.get_block_size(); + const auto block_enabled = false; // thandle.is_block_enabled(); + assert(block_size == 0); + + // Set up functor types + using LowerRPFunc = + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, false); + using LowerTPFunc = + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, false); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -2753,44 +1165,29 @@ struct SptrsvWrap { #endif if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + LowerRPFunc lrpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, block_size); + Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( range_policy(space, node_count, node_count + lvl_nodes), Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedRPSolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + lrpp); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { + LowerTPFunc ltpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + auto tp = team_size == -1 + ? team_policy(space, lvl_nodes, Kokkos::AUTO) + : team_policy(space, lvl_nodes, team_size); + Kokkos::parallel_for( + "parfor_l_team", + Kokkos::Experimental::require( + tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), + ltpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -2837,6 +1234,8 @@ struct SptrsvWrap { else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -2985,6 +1384,8 @@ struct SptrsvWrap { SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); @@ -3067,7 +1468,6 @@ struct SptrsvWrap { std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl; #endif - } // end lower_tri_solve template ( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + urpp); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { + UpperTPFunc utpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - false, node_count); -#else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + auto tp = team_size == -1 + ? team_policy(space, lvl_nodes, Kokkos::AUTO) + : team_policy(space, lvl_nodes, team_size); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), + utpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -3240,6 +1630,8 @@ tstf); } // end elseif else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -3493,6 +1885,9 @@ tstf); } // end elseif SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); + #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); @@ -3603,13 +1998,13 @@ tstf); } // end elseif } // end upper_tri_solve - template static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, - LHSType &lhs, const bool /*is_lowertri_*/) { + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif @@ -3625,28 +2020,14 @@ tstf); } // end elseif auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - const bool is_lowertri = thandle.is_lower_tri(); - size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines using large_cutoff_policy_type = Kokkos::TeamPolicy; - /* - using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = - LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = - UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = - LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = - UpperTriLvlSchedTP1SingleBlockFunctor; - */ using SingleBlockFunctor = TriLvlSchedTP1SingleBlockFunctor; + LHSType, RHSType, IsLower>; int team_size = thandle.get_team_size(); int vector_size = @@ -3670,315 +2051,105 @@ tstf); } // end elseif // team_size_singleblock is unimportant } - // This is only necessary for Lower,UpperTri functor versions; else, - // is_lowertri can be passed as arg to the generic Tri functor... - if (is_lowertri) { - for (size_type chainlink = 0; chainlink < num_chain_entries; - ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) { - team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } - - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for( - "parfor_l_team_chain1", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - - } else { - size_type lvl_nodes = 0; + for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); + + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos + TriLvlSchedTP1SolverFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? + Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + node_count += lvl_nodes; - if (team_size_singleblock <= 0) { - team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } + } else { + size_type lvl_nodes = 0; - if (cutoff <= team_size_singleblock) { -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true); -#else - LowerTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); -#endif - Kokkos::parallel_for( - "parfor_l_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true, 0, - cutoff); -#else - LowerTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); -#endif - Kokkos::parallel_for( - "parfor_l_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? - } - - } else { - for (size_type chainlink = 0; chainlink < num_chain_entries; - ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - is_lowertri, node_count); -#else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) { - team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } + if (team_size_singleblock <= 0) { + team_size_singleblock = + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain), + Kokkos::ParallelForTag()); + } - // TODO To use cudagraph here, need to know how many non-unit chains - // there are, create a graph for each and launch accordingly - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? + if (cutoff <= team_size_singleblock) { + SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, nodes_per_level, + node_count, schain, echain); Kokkos::parallel_for( - "parfor_u_team_chain1", + "parfor_l_team_chainmulti", Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), + team_policy(space, 1, team_size_singleblock, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - node_count += lvl_nodes; - } else { - size_type lvl_nodes = 0; - - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } - - if (team_size_singleblock <= 0) { - // team_size_singleblock = team_policy(1, 1, - // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, - // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, - // node_count), Kokkos::ParallelForTag()); - team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } - - if (cutoff <= team_size_singleblock) { -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri); -#else - UpperTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); -#endif - Kokkos::parallel_for( - "parfor_u_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri, - 0, cutoff); -#else - UpperTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); -#endif - Kokkos::parallel_for( - "parfor_u_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally + SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, nodes_per_level, + node_count, schain, echain, 0, cutoff); + Kokkos::parallel_for( + "parfor_l_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? } + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? } } // end tri_solve_chain // -------------------------------- // Stream interfaces // -------------------------------- - template - static void lower_tri_solve_streams( - const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. Launch work on all streams - for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - range_policy(execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), - LowerTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - int team_size = thandle_v[i]->get_team_size(); -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], true, - node_count_v[i]); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl - } // end lower_tri_solve_streams - - template - static void upper_tri_solve_streams( - const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, std::vector &lhs_v) { + static void tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + using RPPointFunctor = + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, IsLower, false); + using TPPointFunctor = + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, IsLower, false); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); @@ -4011,41 +2182,28 @@ tstf); } // end elseif "parfor_fixed_lvl", range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - UpperTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i])); + RPPointFunctor(row_map_v[i], entries_v[i], values_v[i], + lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { int team_size = thandle_v[i]->get_team_size(); -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], false, - node_count_v[i]); -#else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + auto tp = + team_size == -1 + ? team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) + : team_policy(execspace_v[i], lvl_nodes, team_size); + TPPointFunctor tstf(row_map_v[i], entries_v[i], values_v[i], + lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i], node_count_v[i]); + Kokkos::parallel_for("parfor_l_team", tp, tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams } // end for lvl - } // end upper_tri_solve_streams + } // end tri_solve_streams }; // struct SptrsvWrap diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index d69c499c60..b2c57b1dfa 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -135,19 +135,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x, true); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, + entries, values, b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + Sptrsv::tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Sptrsv::lower_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, + entries, values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { @@ -156,19 +156,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x, false); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, + entries, values, b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, - b, x); + Sptrsv::tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif - Sptrsv::upper_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, + entries, values, b, x); } } Kokkos::Profiling::popRegion(); @@ -202,8 +202,9 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, + b_v, x_v); } else { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -212,8 +213,9 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, + b_v, x_v); } Kokkos::Profiling::popRegion(); } diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index b8b35bc422..8beff14592 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -47,15 +47,12 @@ template struct SptrsvTest { // Define useful types - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using Crs = CrsMatrix; @@ -65,6 +62,9 @@ struct SptrsvTest { using range_policy_t = Kokkos::RangePolicy; + static inline const scalar_t ZERO = scalar_t(0); + static inline const scalar_t ONE = scalar_t(1); + static std::vector> get_5x5_ut_ones_fixture() { std::vector> A = {{1.00, 0.00, 1.00, 0.00, 0.00}, {0.00, 1.00, 0.00, 0.00, 1.00}, @@ -103,6 +103,17 @@ struct SptrsvTest { return A; } + static bool do_cusparse() { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + return ( + std::is_same::value && + std::is_same::value && + std::is_same::value); +#else + return false; +#endif + } + struct ReductionCheck { ValuesType lhs; @@ -112,12 +123,83 @@ struct SptrsvTest { void operator()(lno_t i, scalar_t &tsum) const { tsum += lhs(i); } }; - static void run_test_sptrsv() { - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); + static std::tuple create_crs_lhs_rhs( + const std::vector> &fixture) { + RowMapType row_map; + EntriesType entries; + ValuesType values; + + compress_matrix(row_map, entries, values, fixture); + const auto nrows = row_map.size() - 1; + const auto nnz = values.size(); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + + return std::make_tuple(triMtx, lhs, rhs); + } + + template + static void basic_check(const SpMatrix &triMtx, const ValuesType &lhs, + const ValuesType &rhs, const bool is_lower, + const size_type block_size = 0) { + // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, + // currently unavailable + std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, + SPTRSVAlgorithm::SEQLVLSCHD_TP1}; + if (block_size == 0) { + // SEQLVLSCHD_TP1CHAIN and SPTRSV_CUSPARSE are not supported for blocks + algs.push_back(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN); + if (do_cusparse()) { + algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE); + } + } + + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; + + const size_type nrows = row_map.size() - 1; + + for (auto alg : algs) { + KernelHandle kh; + kh.create_sptrsv_handle(alg, nrows, is_lower, block_size); + if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + } + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + + kh.destroy_sptrsv_handle(); + } + } + + static void run_test_sptrsv() { const size_type nrows = 5; - const size_type nnz = 10; #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using host_crsmat_t = @@ -142,121 +224,13 @@ struct SptrsvTest { // Upper tri { - RowMapType row_map; - EntriesType entries; - ValuesType values; - - auto fixture = get_5x5_ut_ones_fixture(); - - compress_matrix(row_map, entries, values, fixture); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - { - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); + const auto [triMtx, lhs, rhs] = + create_crs_lhs_rhs(get_5x5_ut_ones_fixture()); - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0) ); - */ - - kh.destroy_sptrsv_handle(); + basic_check(triMtx, lhs, rhs, false); } - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); - } - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); - } -#endif - #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) const scalar_t FIVE = scalar_t(5); const size_type nnz_sp = 14; @@ -367,120 +341,13 @@ struct SptrsvTest { // Lower tri { - auto fixture = get_5x5_lt_ones_fixture(); - RowMapType row_map; - EntriesType entries; - ValuesType values; - - compress_matrix(row_map, entries, values, fixture); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - { - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); + const auto [triMtx, lhs, rhs] = + create_crs_lhs_rhs(get_5x5_lt_ones_fixture()); - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ( sum, lhs.extent(0) ); - */ - - kh.destroy_sptrsv_handle(); + basic_check(triMtx, lhs, rhs, true); } - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); - } - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); - } -#endif - #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) { // L in csc @@ -558,7 +425,6 @@ struct SptrsvTest { scalar_t sum = 0.0; Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), ReductionCheck(X), sum); - EXPECT_EQ(sum, lhs.extent(0)); EXPECT_EQ(sum, X.extent(0)); khL.destroy_sptrsv_handle(); @@ -619,7 +485,6 @@ struct SptrsvTest { scalar_t sum = 0.0; Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), ReductionCheck(X), sum); - EXPECT_EQ(sum, lhs.extent(0)); EXPECT_EQ(sum, X.extent(0)); khLd.destroy_sptrsv_handle(); @@ -629,7 +494,8 @@ struct SptrsvTest { } } - static void run_test_sptrsv_streams(int test_algo, int nstreams) { + static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams, + const bool is_lower) { // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition bool run_streams_test = true; @@ -645,9 +511,6 @@ struct SptrsvTest { #endif if (!run_streams_test) return; - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - const size_type nrows = 5; const size_type nnz = 10; @@ -662,150 +525,63 @@ struct SptrsvTest { std::vector rhs_v(nstreams); std::vector lhs_v(nstreams); - RowMapType_hostmirror hrow_map; - EntriesType_hostmirror hentries; - ValuesType_hostmirror hvalues; - - // Upper tri - { - auto fixture = get_5x5_ut_ones_fixture(); - compress_matrix(hrow_map, hentries, hvalues, fixture); - - for (int i = 0; i < nstreams; i++) { - // Allocate U - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); - - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = false; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - EXPECT_EQ(sum, lhs_v[i].extent(0)); - - kh_v[i].destroy_sptrsv_handle(); - } - } - - // Lower tri - { - auto fixture = get_5x5_lt_ones_fixture(); - compress_matrix(hrow_map, hentries, hvalues, fixture); + auto fixture = + is_lower ? get_5x5_lt_ones_fixture() : get_5x5_ut_ones_fixture(); + const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(fixture); - for (int i = 0; i < nstreams; i++) { - // Allocate L - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); + for (int i = 0; i < nstreams; i++) { + // Allocate + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + // Copy + Kokkos::deep_copy(row_map_v[i], row_map); + Kokkos::deep_copy(entries_v[i], entries); + Kokkos::deep_copy(values_v[i], values); - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = true; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - EXPECT_EQ(sum, lhs_v[i].extent(0)); + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); - kh_v[i].destroy_sptrsv_handle(); - } + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + kh_v[i].create_sptrsv_handle(test_algo, nrows, is_lower); + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + EXPECT_EQ(sum, lhs_v[i].extent(0)); + kh_v[i].destroy_sptrsv_handle(); } } }; @@ -823,25 +599,18 @@ template void test_sptrsv_streams() { using TestStruct = Test::SptrsvTest; + std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, + SPTRSVAlgorithm::SEQLVLSCHD_TP1}; + if (TestStruct::do_cusparse()) { + algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE); + } - TestStruct::run_test_sptrsv_streams(0, 1); - TestStruct::run_test_sptrsv_streams(0, 2); - TestStruct::run_test_sptrsv_streams(0, 3); - TestStruct::run_test_sptrsv_streams(0, 4); - TestStruct::run_test_sptrsv_streams(1, 1); - TestStruct::run_test_sptrsv_streams(1, 2); - TestStruct::run_test_sptrsv_streams(1, 3); - TestStruct::run_test_sptrsv_streams(1, 4); - -#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - if (std::is_same::value && - std::is_same::value) { - TestStruct::run_test_sptrsv_streams(2, 1); - TestStruct::run_test_sptrsv_streams(2, 2); - TestStruct::run_test_sptrsv_streams(2, 3); - TestStruct::run_test_sptrsv_streams(2, 4); + for (auto alg : algs) { + for (int nstreams = 1; nstreams <= 4; ++nstreams) { + TestStruct::run_test_sptrsv_streams(alg, nstreams, true); + TestStruct::run_test_sptrsv_streams(alg, nstreams, false); + } } -#endif } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 6e6d6df6a431fc560da8acf16c42685d5afe6cb8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 20 Jul 2024 22:07:03 +0900 Subject: [PATCH 301/326] Bump actions/dependency-review-action from 4.3.3 to 4.3.4 (#2279) Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 4.3.3 to 4.3.4. - [Release notes](https://github.com/actions/dependency-review-action/releases) - [Commits](https://github.com/actions/dependency-review-action/compare/72eb03d02c7872a771aacd928f3123ac62ad6d3a...5a2ce3f5b92ee19cbb1541a4984c76d921601d7c) --- updated-dependencies: - dependency-name: actions/dependency-review-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dependency-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 1792f0181c..56d5770ba5 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -24,4 +24,4 @@ jobs: - name: 'Checkout Repository' uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: 'Dependency Review' - uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 + uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4 From 8ffebaaa4f974ed7baf255f3bffe789d0af29115 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 20 Jul 2024 22:07:31 +0900 Subject: [PATCH 302/326] Bump github/codeql-action from 3.25.11 to 3.25.12 (#2278) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.11 to 3.25.12. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/b611370bb5703a7efb587f9d136a52ea24c5c38c...4fa2a7953630fd2f3fb380f21be14ede0169dd4f) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index e1f8aa51f8..49ce377c26 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 + uses: github/codeql-action/init@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 + uses: github/codeql-action/analyze@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index bf06213b40..d81ed0e1bc 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 + uses: github/codeql-action/upload-sarif@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 with: sarif_file: results.sarif From 985c3a9e7343c2f612560024cae4d968f800c8ac Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sat, 20 Jul 2024 07:10:45 -0600 Subject: [PATCH 303/326] Propose increasing column limit to 120. (#2255) * Change key files * Full reformat * Update format.yml * Update ubuntu version for format checker --- .clang-format | 1 + .git-blame-ignore-revs | 17 + .github/workflows/format.yml | 6 +- batched/KokkosBatched_Util.hpp | 276 +- .../impl/KokkosBatched_AddRadial_Impl.hpp | 17 +- .../impl/KokkosBatched_AddRadial_Internal.hpp | 9 +- ...kosBatched_ApplyGivens_Serial_Internal.hpp | 42 +- ...osBatched_ApplyHouseholder_Serial_Impl.hpp | 30 +- ...tched_ApplyHouseholder_Serial_Internal.hpp | 20 +- ...tched_ApplyHouseholder_TeamVector_Impl.hpp | 34 +- ...d_ApplyHouseholder_TeamVector_Internal.hpp | 76 +- .../impl/KokkosBatched_ApplyPivot_Impl.hpp | 88 +- .../KokkosBatched_ApplyPivot_Internal.hpp | 86 +- .../impl/KokkosBatched_ApplyQ_Serial_Impl.hpp | 48 +- .../KokkosBatched_ApplyQ_Serial_Internal.hpp | 42 +- .../KokkosBatched_ApplyQ_TeamVector_Impl.hpp | 60 +- ...kkosBatched_ApplyQ_TeamVector_Internal.hpp | 48 +- .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 238 +- .../dense/impl/KokkosBatched_Copy_Impl.hpp | 184 +- .../impl/KokkosBatched_Copy_Internal.hpp | 80 +- .../dense/impl/KokkosBatched_Dot_Internal.hpp | 277 +- ...Batched_Eigendecomposition_Serial_Impl.hpp | 36 +- ...hed_Eigendecomposition_Serial_Internal.hpp | 47 +- ...hed_Eigendecomposition_TeamVector_Impl.hpp | 41 +- ...Eigendecomposition_TeamVector_Internal.hpp | 22 +- ...kkosBatched_Eigenvalue_Serial_Internal.hpp | 38 +- .../impl/KokkosBatched_FindAmax_Internal.hpp | 11 +- .../KokkosBatched_Francis_Serial_Internal.hpp | 55 +- .../impl/KokkosBatched_Gemm_Serial_Impl.hpp | 268 +- .../KokkosBatched_Gemm_Serial_Internal.hpp | 35 +- .../KokkosBatched_Gemm_TeamVector_Impl.hpp | 64 +- ...KokkosBatched_Gemm_TeamVector_Internal.hpp | 72 +- .../impl/KokkosBatched_Gemm_Team_Impl.hpp | 144 +- .../impl/KokkosBatched_Gemm_Team_Internal.hpp | 83 +- .../KokkosBatched_Gemv_TeamVector_Impl.hpp | 63 +- ...KokkosBatched_Gemv_TeamVector_Internal.hpp | 69 +- .../impl/KokkosBatched_Gemv_Team_Impl.hpp | 71 +- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 67 +- .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 343 +- .../KokkosBatched_Givens_Serial_Internal.hpp | 11 +- .../KokkosBatched_HadamardProduct_Impl.hpp | 158 +- ...atched_HessenbergFormQ_Serial_Internal.hpp | 17 +- ...HessenbergQR_WithShift_Serial_Internal.hpp | 30 +- ...kkosBatched_Hessenberg_Serial_Internal.hpp | 16 +- ...okkosBatched_HostLevel_Gemm_Armpl_Impl.hpp | 58 +- ...kkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp | 401 +- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 176 +- ...kkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 43 +- .../KokkosBatched_HostLevel_Gemm_Spec.hpp | 279 +- .../KokkosBatched_Householder_Serial_Impl.hpp | 8 +- ...kosBatched_Householder_Serial_Internal.hpp | 8 +- ...kosBatched_Householder_TeamVector_Impl.hpp | 9 +- ...atched_Householder_TeamVector_Internal.hpp | 16 +- ...okkosBatched_InnerGemmFixA_Serial_Impl.hpp | 517 +- ...okkosBatched_InnerGemmFixB_Serial_Impl.hpp | 504 +- ...okkosBatched_InnerGemmFixC_Serial_Impl.hpp | 538 +- .../KokkosBatched_InnerGemmFixC_Team_Impl.hpp | 51 +- .../KokkosBatched_InnerLU_Serial_Impl.hpp | 76 +- .../KokkosBatched_InnerTrsm_Serial_Impl.hpp | 472 +- .../KokkosBatched_InverseLU_Serial_Impl.hpp | 35 +- .../impl/KokkosBatched_LU_Serial_Impl.hpp | 37 +- .../impl/KokkosBatched_LU_Serial_Internal.hpp | 31 +- .../dense/impl/KokkosBatched_LU_Team_Impl.hpp | 16 +- .../impl/KokkosBatched_LU_Team_Internal.hpp | 80 +- ...ftEigenvectorFromSchur_Serial_Internal.hpp | 34 +- .../impl/KokkosBatched_Normalize_Internal.hpp | 9 +- .../impl/KokkosBatched_Pttrf_Serial_Impl.hpp | 23 +- .../KokkosBatched_Pttrf_Serial_Internal.hpp | 27 +- ...KokkosBatched_QR_FormQ_Serial_Internal.hpp | 12 +- ...osBatched_QR_FormQ_TeamVector_Internal.hpp | 17 +- .../impl/KokkosBatched_QR_Serial_Impl.hpp | 7 +- .../impl/KokkosBatched_QR_Serial_Internal.hpp | 11 +- .../impl/KokkosBatched_QR_TeamVector_Impl.hpp | 7 +- .../KokkosBatched_QR_TeamVector_Internal.hpp | 11 +- ..._QR_WithColumnPivoting_TeamVector_Impl.hpp | 16 +- ...WithColumnPivoting_TeamVector_Internal.hpp | 36 +- ...htEigenvectorFromSchur_Serial_Internal.hpp | 30 +- .../impl/KokkosBatched_SVD_Serial_Impl.hpp | 60 +- .../KokkosBatched_SVD_Serial_Internal.hpp | 139 +- ...KokkosBatched_Schur2x2_Serial_Internal.hpp | 39 +- .../KokkosBatched_Schur_Serial_Internal.hpp | 80 +- .../impl/KokkosBatched_SetIdentity_Impl.hpp | 9 +- .../KokkosBatched_SetIdentity_Internal.hpp | 20 +- .../KokkosBatched_SetTriangular_Internal.hpp | 21 +- ...kosBatched_ShiftedTrsv_Serial_Internal.hpp | 28 +- ...KokkosBatched_SolveUTV_TeamVector_Impl.hpp | 31 +- ...osBatched_SolveUTV_TeamVector_Internal.hpp | 77 +- .../impl/KokkosBatched_Tbsv_Serial_Impl.hpp | 71 +- .../KokkosBatched_Tbsv_Serial_Internal.hpp | 73 +- .../impl/KokkosBatched_Trmm_Serial_Impl.hpp | 144 +- .../KokkosBatched_Trmm_Serial_Internal.hpp | 218 +- .../impl/KokkosBatched_Trsm_Serial_Impl.hpp | 321 +- .../KokkosBatched_Trsm_Serial_Internal.hpp | 92 +- .../KokkosBatched_Trsm_TeamVector_Impl.hpp | 60 +- ...KokkosBatched_Trsm_TeamVector_Internal.hpp | 92 +- .../impl/KokkosBatched_Trsm_Team_Impl.hpp | 196 +- .../impl/KokkosBatched_Trsm_Team_Internal.hpp | 177 +- .../impl/KokkosBatched_Trsv_Serial_Impl.hpp | 222 +- .../KokkosBatched_Trsv_Serial_Internal.hpp | 99 +- .../KokkosBatched_Trsv_TeamVector_Impl.hpp | 52 +- ...KokkosBatched_Trsv_TeamVector_Internal.hpp | 71 +- .../impl/KokkosBatched_Trsv_Team_Impl.hpp | 103 +- .../impl/KokkosBatched_Trsv_Team_Internal.hpp | 89 +- .../impl/KokkosBatched_Trtri_Serial_Impl.hpp | 10 +- .../KokkosBatched_Trtri_Serial_Internal.hpp | 46 +- .../KokkosBatched_UTV_TeamVector_Impl.hpp | 17 +- .../KokkosBatched_UTV_TeamVector_Internal.hpp | 33 +- .../KokkosBatched_UpdateGivens_Internal.hpp | 5 +- .../impl/KokkosBatched_Vector_SIMD_Arith.hpp | 445 +- .../KokkosBatched_Vector_SIMD_Logical.hpp | 41 +- .../impl/KokkosBatched_Vector_SIMD_Math.hpp | 45 +- .../impl/KokkosBatched_Vector_SIMD_Misc.hpp | 76 +- .../KokkosBatched_Vector_SIMD_Relation.hpp | 40 +- .../impl/KokkosBatched_Vector_SIMD_View.hpp | 178 +- ...Batched_WilkinsonShift_Serial_Internal.hpp | 10 +- .../dense/impl/KokkosBatched_Xpay_Impl.hpp | 193 +- .../src/KokkosBatched_AddRadial_Decl.hpp | 7 +- .../KokkosBatched_ApplyHouseholder_Decl.hpp | 13 +- .../src/KokkosBatched_ApplyPivot_Decl.hpp | 7 +- .../dense/src/KokkosBatched_ApplyQ_Decl.hpp | 52 +- batched/dense/src/KokkosBatched_Axpy.hpp | 12 +- batched/dense/src/KokkosBatched_Copy_Decl.hpp | 48 +- batched/dense/src/KokkosBatched_Dot.hpp | 12 +- .../KokkosBatched_Eigendecomposition_Decl.hpp | 18 +- batched/dense/src/KokkosBatched_Gemm_Decl.hpp | 49 +- batched/dense/src/KokkosBatched_Gemv_Decl.hpp | 110 +- batched/dense/src/KokkosBatched_Gesv.hpp | 17 +- .../src/KokkosBatched_HadamardProduct.hpp | 27 +- .../src/KokkosBatched_HostLevel_Gemm.hpp | 33 +- .../KokkosBatched_HostLevel_Gemm_Handle.hpp | 39 +- .../src/KokkosBatched_Householder_Decl.hpp | 7 +- .../src/KokkosBatched_InnerGemmFixA_Decl.hpp | 16 +- .../src/KokkosBatched_InnerGemmFixB_Decl.hpp | 16 +- .../src/KokkosBatched_InnerGemmFixC_Decl.hpp | 34 +- .../dense/src/KokkosBatched_InnerLU_Decl.hpp | 6 +- .../src/KokkosBatched_InnerTrsm_Decl.hpp | 36 +- .../src/KokkosBatched_InverseLU_Decl.hpp | 20 +- .../dense/src/KokkosBatched_Kernel_Handle.hpp | 14 +- batched/dense/src/KokkosBatched_LU_Decl.hpp | 10 +- batched/dense/src/KokkosBatched_Pttrf.hpp | 3 +- batched/dense/src/KokkosBatched_QR_Decl.hpp | 18 +- ...kkosBatched_QR_WithColumnPivoting_Decl.hpp | 10 +- batched/dense/src/KokkosBatched_SVD_Decl.hpp | 12 +- .../dense/src/KokkosBatched_Scale_Decl.hpp | 50 +- .../src/KokkosBatched_SetIdentity_Decl.hpp | 6 +- batched/dense/src/KokkosBatched_Set_Decl.hpp | 50 +- .../dense/src/KokkosBatched_SolveLU_Decl.hpp | 48 +- .../dense/src/KokkosBatched_SolveUTV_Decl.hpp | 12 +- batched/dense/src/KokkosBatched_Tbsv.hpp | 6 +- batched/dense/src/KokkosBatched_Trmm_Decl.hpp | 7 +- batched/dense/src/KokkosBatched_Trsm_Decl.hpp | 38 +- batched/dense/src/KokkosBatched_Trsv_Decl.hpp | 231 +- batched/dense/src/KokkosBatched_UTV_Decl.hpp | 10 +- batched/dense/src/KokkosBatched_Vector.hpp | 34 +- .../dense/src/KokkosBatched_Vector_SIMD.hpp | 77 +- batched/dense/src/KokkosBatched_Xpay.hpp | 12 +- .../unit_test/Test_Batched_BatchedGemm.hpp | 242 +- .../Test_Batched_BatchedGemm_Complex.hpp | 118 +- .../Test_Batched_BatchedGemm_Real.hpp | 182 +- .../unit_test/Test_Batched_DenseUtils.hpp | 14 +- .../unit_test/Test_Batched_SerialAxpy.hpp | 31 +- .../Test_Batched_SerialAxpy_Complex.hpp | 3 +- .../Test_Batched_SerialAxpy_Real.hpp | 8 +- .../unit_test/Test_Batched_SerialGemm.hpp | 142 +- .../Test_Batched_SerialGemm_Complex.hpp | 48 +- .../Test_Batched_SerialGemm_Real.hpp | 112 +- .../unit_test/Test_Batched_SerialGesv.hpp | 48 +- .../Test_Batched_SerialInverseLU.hpp | 62 +- .../Test_Batched_SerialInverseLU_Complex.hpp | 6 +- .../dense/unit_test/Test_Batched_SerialLU.hpp | 14 +- .../unit_test/Test_Batched_SerialPttrf.hpp | 167 +- .../unit_test/Test_Batched_SerialSVD.hpp | 122 +- .../unit_test/Test_Batched_SerialSolveLU.hpp | 48 +- .../Test_Batched_SerialSolveLU_Complex.hpp | 6 +- .../unit_test/Test_Batched_SerialTbsv.hpp | 119 +- .../Test_Batched_SerialTbsv_Complex.hpp | 64 +- .../Test_Batched_SerialTbsv_Real.hpp | 80 +- .../unit_test/Test_Batched_SerialTrmm.hpp | 144 +- .../Test_Batched_SerialTrmm_Complex.hpp | 270 +- .../Test_Batched_SerialTrmm_Real.hpp | 198 +- .../unit_test/Test_Batched_SerialTrsm.hpp | 62 +- .../Test_Batched_SerialTrsm_Complex.hpp | 112 +- .../Test_Batched_SerialTrsm_Real.hpp | 110 +- .../unit_test/Test_Batched_SerialTrsv.hpp | 69 +- .../Test_Batched_SerialTrsv_Complex.hpp | 48 +- .../Test_Batched_SerialTrsv_Real.hpp | 36 +- .../unit_test/Test_Batched_SerialTrtri.hpp | 89 +- .../Test_Batched_SerialTrtri_Complex.hpp | 28 +- .../Test_Batched_SerialTrtri_Real.hpp | 12 +- .../dense/unit_test/Test_Batched_TeamAxpy.hpp | 48 +- .../Test_Batched_TeamAxpy_Complex.hpp | 3 +- .../unit_test/Test_Batched_TeamAxpy_Real.hpp | 4 +- .../dense/unit_test/Test_Batched_TeamGemm.hpp | 148 +- .../Test_Batched_TeamGemm_Complex.hpp | 52 +- .../unit_test/Test_Batched_TeamGemm_Real.hpp | 120 +- .../dense/unit_test/Test_Batched_TeamGesv.hpp | 61 +- .../unit_test/Test_Batched_TeamGesv_Real.hpp | 6 +- .../unit_test/Test_Batched_TeamInverseLU.hpp | 72 +- .../Test_Batched_TeamInverseLU_Complex.hpp | 6 +- .../dense/unit_test/Test_Batched_TeamLU.hpp | 26 +- .../unit_test/Test_Batched_TeamSolveLU.hpp | 60 +- .../Test_Batched_TeamSolveLU_Complex.hpp | 6 +- .../dense/unit_test/Test_Batched_TeamTrsm.hpp | 77 +- .../Test_Batched_TeamTrsm_Complex.hpp | 150 +- .../unit_test/Test_Batched_TeamTrsm_Real.hpp | 140 +- .../dense/unit_test/Test_Batched_TeamTrsv.hpp | 60 +- .../unit_test/Test_Batched_TeamVectorAxpy.hpp | 49 +- .../Test_Batched_TeamVectorAxpy_Complex.hpp | 3 +- .../unit_test/Test_Batched_TeamVectorGemm.hpp | 166 +- .../Test_Batched_TeamVectorGemm_Complex.hpp | 50 +- .../Test_Batched_TeamVectorGemm_Real.hpp | 100 +- .../unit_test/Test_Batched_TeamVectorGesv.hpp | 65 +- .../Test_Batched_TeamVectorGesv_Real.hpp | 12 +- .../unit_test/Test_Batched_TeamVectorQR.hpp | 66 +- ...atched_TeamVectorQR_WithColumnPivoting.hpp | 90 +- .../Test_Batched_TeamVectorSolveUTV.hpp | 105 +- .../Test_Batched_TeamVectorSolveUTV2.hpp | 115 +- .../unit_test/Test_Batched_TeamVectorUTV.hpp | 133 +- .../Test_Batched_VectorArithmatic.hpp | 138 +- .../unit_test/Test_Batched_VectorLogical.hpp | 54 +- .../unit_test/Test_Batched_VectorMath.hpp | 69 +- .../unit_test/Test_Batched_VectorMisc.hpp | 27 +- .../unit_test/Test_Batched_VectorRelation.hpp | 38 +- .../unit_test/Test_Batched_VectorView.hpp | 270 +- .../impl/KokkosBatched_CG_TeamVector_Impl.hpp | 91 +- .../impl/KokkosBatched_CG_Team_Impl.hpp | 92 +- .../impl/KokkosBatched_GMRES_Serial_Impl.hpp | 93 +- .../KokkosBatched_GMRES_TeamVector_Impl.hpp | 355 +- .../impl/KokkosBatched_GMRES_Team_Impl.hpp | 348 +- .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 205 +- .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 371 +- .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 307 +- batched/sparse/src/KokkosBatched_CG.hpp | 18 +- .../sparse/src/KokkosBatched_CrsMatrix.hpp | 42 +- batched/sparse/src/KokkosBatched_GMRES.hpp | 21 +- batched/sparse/src/KokkosBatched_Identity.hpp | 17 +- .../sparse/src/KokkosBatched_JacobiPrec.hpp | 40 +- .../src/KokkosBatched_Krylov_Handle.hpp | 46 +- .../src/KokkosBatched_Krylov_Solvers.hpp | 121 +- batched/sparse/src/KokkosBatched_Spmv.hpp | 140 +- .../unit_test/Test_Batched_SerialGMRES.hpp | 91 +- .../Test_Batched_SerialGMRES_Real.hpp | 8 +- .../unit_test/Test_Batched_SerialSpmv.hpp | 74 +- .../unit_test/Test_Batched_SparseUtils.hpp | 19 +- .../sparse/unit_test/Test_Batched_TeamCG.hpp | 76 +- .../unit_test/Test_Batched_TeamCG_Real.hpp | 8 +- .../unit_test/Test_Batched_TeamGMRES.hpp | 102 +- .../unit_test/Test_Batched_TeamGMRES_Real.hpp | 8 +- .../unit_test/Test_Batched_TeamSpmv.hpp | 116 +- .../unit_test/Test_Batched_TeamVectorCG.hpp | 81 +- .../Test_Batched_TeamVectorCG_Real.hpp | 8 +- .../Test_Batched_TeamVectorGMRES.hpp | 104 +- .../Test_Batched_TeamVectorGMRES_Real.hpp | 8 +- .../unit_test/Test_Batched_TeamVectorSpmv.hpp | 122 +- blas/impl/KokkosBlas1_abs_impl.hpp | 9 +- blas/impl/KokkosBlas1_abs_spec.hpp | 137 +- blas/impl/KokkosBlas1_axpby_impl.hpp | 116 +- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 212 +- blas/impl/KokkosBlas1_axpby_spec.hpp | 434 +- ...Blas1_axpby_unification_attempt_traits.hpp | 734 ++- blas/impl/KokkosBlas1_dot_impl.hpp | 18 +- blas/impl/KokkosBlas1_dot_mv_impl.hpp | 48 +- blas/impl/KokkosBlas1_dot_spec.hpp | 542 +- blas/impl/KokkosBlas1_iamax_impl.hpp | 15 +- blas/impl/KokkosBlas1_iamax_spec.hpp | 307 +- blas/impl/KokkosBlas1_mult_impl.hpp | 29 +- blas/impl/KokkosBlas1_mult_spec.hpp | 213 +- blas/impl/KokkosBlas1_nrm1_impl.hpp | 47 +- blas/impl/KokkosBlas1_nrm1_spec.hpp | 172 +- blas/impl/KokkosBlas1_nrm2_impl.hpp | 69 +- blas/impl/KokkosBlas1_nrm2_spec.hpp | 187 +- blas/impl/KokkosBlas1_nrm2w_impl.hpp | 76 +- blas/impl/KokkosBlas1_nrm2w_spec.hpp | 179 +- blas/impl/KokkosBlas1_nrminf_impl.hpp | 9 +- blas/impl/KokkosBlas1_nrminf_spec.hpp | 168 +- blas/impl/KokkosBlas1_reciprocal_impl.hpp | 9 +- blas/impl/KokkosBlas1_reciprocal_spec.hpp | 150 +- blas/impl/KokkosBlas1_rot_impl.hpp | 7 +- blas/impl/KokkosBlas1_rot_spec.hpp | 76 +- blas/impl/KokkosBlas1_rotg_impl.hpp | 46 +- blas/impl/KokkosBlas1_rotg_spec.hpp | 78 +- blas/impl/KokkosBlas1_rotm_impl.hpp | 29 +- blas/impl/KokkosBlas1_rotm_spec.hpp | 77 +- blas/impl/KokkosBlas1_rotmg_impl.hpp | 15 +- blas/impl/KokkosBlas1_rotmg_spec.hpp | 91 +- blas/impl/KokkosBlas1_scal_impl.hpp | 33 +- blas/impl/KokkosBlas1_scal_mv_impl.hpp | 184 +- blas/impl/KokkosBlas1_scal_spec.hpp | 278 +- blas/impl/KokkosBlas1_serial_scal_impl.hpp | 9 +- blas/impl/KokkosBlas1_set_impl.hpp | 71 +- blas/impl/KokkosBlas1_sum_impl.hpp | 49 +- blas/impl/KokkosBlas1_sum_spec.hpp | 148 +- blas/impl/KokkosBlas1_swap_impl.hpp | 3 +- blas/impl/KokkosBlas1_swap_spec.hpp | 70 +- blas/impl/KokkosBlas1_team_abs_spec.hpp | 12 +- blas/impl/KokkosBlas1_team_axpby_spec.hpp | 20 +- blas/impl/KokkosBlas1_team_dot_spec.hpp | 17 +- blas/impl/KokkosBlas1_team_mult_spec.hpp | 24 +- blas/impl/KokkosBlas1_team_nrm2_spec.hpp | 23 +- blas/impl/KokkosBlas1_team_scal_impl.hpp | 63 +- blas/impl/KokkosBlas1_team_scal_spec.hpp | 16 +- blas/impl/KokkosBlas1_team_update_spec.hpp | 27 +- blas/impl/KokkosBlas1_update_impl.hpp | 144 +- blas/impl/KokkosBlas1_update_spec.hpp | 252 +- blas/impl/KokkosBlas2_gemv_impl.hpp | 438 +- blas/impl/KokkosBlas2_gemv_spec.hpp | 108 +- blas/impl/KokkosBlas2_ger_impl.hpp | 118 +- blas/impl/KokkosBlas2_ger_spec.hpp | 87 +- blas/impl/KokkosBlas2_serial_gemv_impl.hpp | 100 +- ...osBlas2_serial_gemv_inner_multiple_dot.hpp | 134 +- .../impl/KokkosBlas2_serial_gemv_internal.hpp | 43 +- blas/impl/KokkosBlas2_syr2_impl.hpp | 231 +- blas/impl/KokkosBlas2_syr2_spec.hpp | 110 +- blas/impl/KokkosBlas2_syr_impl.hpp | 150 +- blas/impl/KokkosBlas2_syr_spec.hpp | 90 +- blas/impl/KokkosBlas2_team_gemv_impl.hpp | 131 +- blas/impl/KokkosBlas2_team_gemv_spec.hpp | 177 +- blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp | 56 +- blas/impl/KokkosBlas3_gemm_impl.hpp | 611 +-- blas/impl/KokkosBlas3_gemm_spec.hpp | 299 +- blas/impl/KokkosBlas3_trmm_impl.hpp | 69 +- blas/impl/KokkosBlas3_trmm_spec.hpp | 94 +- blas/impl/KokkosBlas3_trsm_impl.hpp | 287 +- blas/impl/KokkosBlas3_trsm_spec.hpp | 111 +- blas/impl/KokkosBlas_serial_axpy.hpp | 20 +- blas/impl/KokkosBlas_serial_nrm2.hpp | 19 +- blas/impl/KokkosBlas_util.hpp | 9 +- blas/src/KokkosBlas1_abs.hpp | 40 +- blas/src/KokkosBlas1_axpby.hpp | 97 +- blas/src/KokkosBlas1_dot.hpp | 156 +- blas/src/KokkosBlas1_fill.hpp | 3 +- blas/src/KokkosBlas1_iamax.hpp | 78 +- blas/src/KokkosBlas1_mult.hpp | 69 +- blas/src/KokkosBlas1_nrm1.hpp | 92 +- blas/src/KokkosBlas1_nrm2.hpp | 112 +- blas/src/KokkosBlas1_nrm2_squared.hpp | 88 +- blas/src/KokkosBlas1_nrm2w.hpp | 75 +- blas/src/KokkosBlas1_nrm2w_squared.hpp | 90 +- blas/src/KokkosBlas1_nrminf.hpp | 88 +- blas/src/KokkosBlas1_reciprocal.hpp | 40 +- blas/src/KokkosBlas1_rot.hpp | 63 +- blas/src/KokkosBlas1_rotg.hpp | 40 +- blas/src/KokkosBlas1_rotm.hpp | 63 +- blas/src/KokkosBlas1_rotmg.hpp | 47 +- blas/src/KokkosBlas1_scal.hpp | 69 +- blas/src/KokkosBlas1_set.hpp | 21 +- blas/src/KokkosBlas1_sum.hpp | 64 +- blas/src/KokkosBlas1_swap.hpp | 42 +- blas/src/KokkosBlas1_team_abs.hpp | 3 +- blas/src/KokkosBlas1_team_axpby.hpp | 17 +- blas/src/KokkosBlas1_team_dot.hpp | 6 +- blas/src/KokkosBlas1_team_mult.hpp | 10 +- blas/src/KokkosBlas1_team_nrm2.hpp | 6 +- blas/src/KokkosBlas1_team_scal.hpp | 5 +- blas/src/KokkosBlas1_team_update.hpp | 12 +- blas/src/KokkosBlas1_update.hpp | 66 +- blas/src/KokkosBlas2_gemv.hpp | 148 +- blas/src/KokkosBlas2_ger.hpp | 83 +- blas/src/KokkosBlas2_serial_gemv.hpp | 17 +- blas/src/KokkosBlas2_syr.hpp | 69 +- blas/src/KokkosBlas2_syr2.hpp | 93 +- blas/src/KokkosBlas2_team_gemv.hpp | 61 +- blas/src/KokkosBlas3_gemm.hpp | 129 +- blas/src/KokkosBlas3_trmm.hpp | 61 +- blas/src/KokkosBlas3_trsm.hpp | 61 +- blas/src/KokkosBlas_trtri.hpp | 3 +- .../tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp | 69 +- blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 583 +-- blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 74 +- blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 398 +- .../tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp | 189 +- blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp | 708 ++- blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp | 3 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 136 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 481 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 71 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 568 +-- .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 33 +- .../tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp | 328 +- blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp | 62 +- blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp | 512 +- blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp | 195 +- blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp | 946 ++-- blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp | 105 +- blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp | 396 +- .../tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp | 108 +- blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp | 443 +- blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 122 +- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 458 +- blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp | 149 +- blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp | 733 ++- .../KokkosBlas1_update_tpl_spec_avail.hpp | 3 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 175 +- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 1172 ++--- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 218 +- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 485 +- .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 513 +- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 482 +- .../KokkosBlas2_serial_gemv_tpl_spec_decl.hpp | 76 +- blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp | 222 +- .../KokkosBlas2_syr2_tpl_spec_decl_blas.hpp | 466 +- .../KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp | 562 +-- ...KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp | 504 +- blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp | 203 +- .../KokkosBlas2_syr_tpl_spec_decl_blas.hpp | 404 +- .../KokkosBlas2_syr_tpl_spec_decl_cublas.hpp | 505 +- .../KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp | 452 +- blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 175 +- blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 689 ++- blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp | 113 +- blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp | 508 +- blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp | 113 +- blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp | 1348 +++-- blas/tpls/KokkosBlas_Cuda_tpl.hpp | 3 +- blas/tpls/KokkosBlas_Host_tpl.cpp | 837 ++- blas/tpls/KokkosBlas_Host_tpl.hpp | 61 +- blas/tpls/KokkosBlas_Rocm_tpl.hpp | 3 +- blas/tpls/KokkosBlas_tpl_spec.hpp | 53 +- blas/unit_test/Test_Blas1_abs.hpp | 56 +- blas/unit_test/Test_Blas1_asum.hpp | 29 +- blas/unit_test/Test_Blas1_axpby.hpp | 71 +- .../Test_Blas1_axpby_unification.hpp | 999 ++-- blas/unit_test/Test_Blas1_axpy.hpp | 63 +- blas/unit_test/Test_Blas1_dot.hpp | 67 +- blas/unit_test/Test_Blas1_iamax.hpp | 64 +- blas/unit_test/Test_Blas1_mult.hpp | 141 +- blas/unit_test/Test_Blas1_nrm1.hpp | 57 +- blas/unit_test/Test_Blas1_nrm2.hpp | 47 +- blas/unit_test/Test_Blas1_nrm2_squared.hpp | 56 +- blas/unit_test/Test_Blas1_nrm2w.hpp | 53 +- blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 50 +- blas/unit_test/Test_Blas1_nrminf.hpp | 47 +- blas/unit_test/Test_Blas1_reciprocal.hpp | 83 +- blas/unit_test/Test_Blas1_rot.hpp | 12 +- blas/unit_test/Test_Blas1_rotg.hpp | 15 +- blas/unit_test/Test_Blas1_rotm.hpp | 15 +- blas/unit_test/Test_Blas1_rotmg.hpp | 19 +- blas/unit_test/Test_Blas1_scal.hpp | 62 +- blas/unit_test/Test_Blas1_serial_setscal.hpp | 89 +- blas/unit_test/Test_Blas1_sum.hpp | 41 +- blas/unit_test/Test_Blas1_swap.hpp | 12 +- blas/unit_test/Test_Blas1_team_abs.hpp | 139 +- blas/unit_test/Test_Blas1_team_axpby.hpp | 150 +- blas/unit_test/Test_Blas1_team_axpy.hpp | 147 +- blas/unit_test/Test_Blas1_team_dot.hpp | 179 +- blas/unit_test/Test_Blas1_team_mult.hpp | 244 +- blas/unit_test/Test_Blas1_team_nrm2.hpp | 49 +- blas/unit_test/Test_Blas1_team_scal.hpp | 188 +- blas/unit_test/Test_Blas1_team_setscal.hpp | 103 +- blas/unit_test/Test_Blas1_team_update.hpp | 258 +- blas/unit_test/Test_Blas1_update.hpp | 152 +- blas/unit_test/Test_Blas2_gemv.hpp | 189 +- blas/unit_test/Test_Blas2_gemv_util.hpp | 129 +- blas/unit_test/Test_Blas2_ger.hpp | 800 +-- blas/unit_test/Test_Blas2_serial_gemv.hpp | 42 +- blas/unit_test/Test_Blas2_syr.hpp | 873 ++-- blas/unit_test/Test_Blas2_syr2.hpp | 980 ++-- blas/unit_test/Test_Blas2_team_gemv.hpp | 21 +- blas/unit_test/Test_Blas2_teamvector_gemv.hpp | 28 +- blas/unit_test/Test_Blas3_gemm.hpp | 198 +- blas/unit_test/Test_Blas3_trmm.hpp | 299 +- blas/unit_test/Test_Blas3_trsm.hpp | 290 +- blas/unit_test/Test_Blas_Newton.hpp | 15 +- blas/unit_test/Test_Blas_rocblas.hpp | 3 +- blas/unit_test/Test_Blas_serial_axpy.hpp | 67 +- blas/unit_test/Test_Blas_serial_nrm2.hpp | 89 +- common/impl/KokkosKernels_Iota.hpp | 10 +- common/impl/KokkosKernels_NaN.hpp | 7 +- common/impl/KokkosKernels_SafeCompare.hpp | 3 +- common/impl/KokkosKernels_ViewUtils.hpp | 8 +- common/src/KokkosKernels_BitUtils.hpp | 3 +- .../KokkosKernels_BlockHashmapAccumulator.hpp | 117 +- common/src/KokkosKernels_BlockUtils.hpp | 44 +- common/src/KokkosKernels_Error.hpp | 26 +- common/src/KokkosKernels_ExecSpaceUtils.hpp | 121 +- .../src/KokkosKernels_HashmapAccumulator.hpp | 173 +- common/src/KokkosKernels_IOUtils.hpp | 28 +- common/src/KokkosKernels_LowerBound.hpp | 95 +- common/src/KokkosKernels_Macros.hpp | 11 +- common/src/KokkosKernels_Predicates.hpp | 39 +- .../src/KokkosKernels_PrintConfiguration.hpp | 11 +- common/src/KokkosKernels_PrintUtils.hpp | 34 +- common/src/KokkosKernels_SimpleUtils.hpp | 177 +- common/src/KokkosKernels_Sorting.hpp | 398 +- common/src/KokkosKernels_TplsVersion.hpp | 6 +- ...Kernels_Uniform_Initialized_MemoryPool.hpp | 26 +- common/src/KokkosKernels_UpperBound.hpp | 13 +- common/src/KokkosKernels_Utils.hpp | 866 ++-- common/src/KokkosKernels_VectorUtils.hpp | 35 +- common/src/KokkosKernels_helpers.hpp | 40 +- common/src/Kokkos_ArithTraits.hpp | 716 +-- common/src/Kokkos_InnerProductSpaceTraits.hpp | 61 +- common/unit_test/Test_Common_AlignPtrTo.hpp | 29 +- common/unit_test/Test_Common_ArithTraits.hpp | 194 +- common/unit_test/Test_Common_Error.hpp | 3 +- common/unit_test/Test_Common_Iota.hpp | 18 +- common/unit_test/Test_Common_LowerBound.hpp | 67 +- .../Test_Common_PrintConfiguration.hpp | 4 +- common/unit_test/Test_Common_Sorting.hpp | 138 +- common/unit_test/Test_Common_UpperBound.hpp | 67 +- common/unit_test/Test_Common_Version.hpp | 3 +- common/unit_test/Test_Common_float128.hpp | 13 +- .../unit_test/Test_Common_set_bit_count.hpp | 58 +- example/batched_solve/examples_helper.hpp | 45 +- example/batched_solve/static_pivoting.cpp | 31 +- example/batched_solve/team_GMRES.cpp | 140 +- example/gmres/ex_real_A.cpp | 62 +- example/gmres/test_prec.cpp | 73 +- ...kosKernels_Example_Distance2GraphColor.cpp | 178 +- example/graph/PartitioningExample.cpp | 2 +- example/half/xpy.cpp | 21 +- ...kkosKernels_Example_HashmapAccumulator.cpp | 79 +- example/wiki/blas/abs/abs.cpp | 3 +- .../graph/KokkosGraph_wiki_9pt_stencil.hpp | 10 +- .../graph/KokkosGraph_wiki_coarsening.cpp | 5 +- .../wiki/graph/KokkosGraph_wiki_coloring.cpp | 17 +- example/wiki/graph/KokkosGraph_wiki_mis2.cpp | 19 +- example/wiki/graph/KokkosGraph_wiki_rcm.cpp | 21 +- .../sparse/KokkosSparse_wiki_bsrmatrix.cpp | 34 +- .../sparse/KokkosSparse_wiki_bsrmatrix_2.cpp | 71 +- .../sparse/KokkosSparse_wiki_crsmatrix.cpp | 22 +- .../sparse/KokkosSparse_wiki_gauss_seidel.cpp | 34 +- .../wiki/sparse/KokkosSparse_wiki_spadd.cpp | 22 +- .../wiki/sparse/KokkosSparse_wiki_spgemm.cpp | 22 +- .../wiki/sparse/KokkosSparse_wiki_spmv.cpp | 24 +- graph/impl/KokkosGraph_BFS_impl.hpp | 41 +- .../impl/KokkosGraph_Distance1Color_impl.hpp | 1136 ++--- .../impl/KokkosGraph_Distance2Color_impl.hpp | 410 +- graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 365 +- .../KokkosGraph_ExplicitCoarsening_impl.hpp | 166 +- graph/impl/KokkosGraph_color_d1_spec.hpp | 89 +- graph/src/KokkosGraph_CoarsenConstruct.hpp | 1050 ++-- graph/src/KokkosGraph_CoarsenHeuristics.hpp | 229 +- graph/src/KokkosGraph_Distance1Color.hpp | 45 +- .../src/KokkosGraph_Distance1ColorHandle.hpp | 292 +- graph/src/KokkosGraph_Distance2Color.hpp | 80 +- .../src/KokkosGraph_Distance2ColorHandle.hpp | 167 +- graph/src/KokkosGraph_ExplicitCoarsening.hpp | 68 +- graph/src/KokkosGraph_MIS2.hpp | 49 +- graph/src/KokkosGraph_Triangle.hpp | 231 +- graph/unit_test/Test_Graph_coarsen.hpp | 221 +- graph/unit_test/Test_Graph_graph_color.hpp | 118 +- .../Test_Graph_graph_color_deterministic.hpp | 75 +- .../Test_Graph_graph_color_distance2.hpp | 291 +- graph/unit_test/Test_Graph_mis2.hpp | 189 +- graph/unit_test/Test_Graph_rcm.hpp | 154 +- lapack/impl/KokkosLapack_gesv_spec.hpp | 88 +- lapack/impl/KokkosLapack_svd_spec.hpp | 114 +- lapack/impl/KokkosLapack_trtri_impl.hpp | 23 +- lapack/impl/KokkosLapack_trtri_spec.hpp | 57 +- lapack/src/KokkosLapack_gesv.hpp | 71 +- lapack/src/KokkosLapack_svd.hpp | 114 +- lapack/src/KokkosLapack_trtri.hpp | 25 +- lapack/tpls/KokkosLapack_Cuda_tpl.hpp | 3 +- lapack/tpls/KokkosLapack_Host_tpl.cpp | 128 +- lapack/tpls/KokkosLapack_Host_tpl.hpp | 14 +- lapack/tpls/KokkosLapack_cusolver.hpp | 34 +- .../tpls/KokkosLapack_gesv_tpl_spec_avail.hpp | 147 +- .../tpls/KokkosLapack_gesv_tpl_spec_decl.hpp | 546 +- .../tpls/KokkosLapack_svd_tpl_spec_avail.hpp | 174 +- .../tpls/KokkosLapack_svd_tpl_spec_decl.hpp | 617 +-- .../KokkosLapack_trtri_tpl_spec_avail.hpp | 91 +- .../tpls/KokkosLapack_trtri_tpl_spec_decl.hpp | 253 +- lapack/unit_test/Test_Lapack_gesv.hpp | 177 +- lapack/unit_test/Test_Lapack_svd.hpp | 105 +- lapack/unit_test/Test_Lapack_trtri.hpp | 107 +- ode/impl/KokkosODE_BDF_impl.hpp | 183 +- ode/impl/KokkosODE_Newton_impl.hpp | 29 +- ode/impl/KokkosODE_RungeKuttaTables_impl.hpp | 97 +- ode/impl/KokkosODE_RungeKutta_impl.hpp | 42 +- ode/src/KokkosODE_BDF.hpp | 43 +- ode/src/KokkosODE_Newton.hpp | 15 +- ode/src/KokkosODE_RungeKutta.hpp | 14 +- ode/src/KokkosODE_Types.hpp | 14 +- ode/unit_test/Test_ODE_BDF.hpp | 272 +- ode/unit_test/Test_ODE_Newton.hpp | 152 +- ode/unit_test/Test_ODE_RK.hpp | 197 +- ode/unit_test/Test_ODE_RK_chem.hpp | 50 +- perf_test/Benchmark_Context.hpp | 42 +- .../KokkosKernels_perf_test_instantiation.hpp | 7 +- .../KokkosKernels_perf_test_utilities.hpp | 69 +- perf_test/PerfTestUtilities.cpp | 4 +- perf_test/PerfTestUtilities.hpp | 37 +- ...okkosBatched_Test_BlockJacobi_Tutorial.cpp | 92 +- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 530 +- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 471 +- .../KokkosBatched_Test_Gemm_Cuda.cpp | 330 +- .../KokkosBatched_Test_Gemm_Host.hpp | 185 +- .../KokkosBatched_Test_Gemv_Host.hpp | 110 +- .../do-not-use/KokkosBatched_Test_LU_Cuda.cpp | 233 +- .../do-not-use/KokkosBatched_Test_LU_Host.hpp | 101 +- .../KokkosBatched_Test_Trsm_Cuda.cpp | 476 +- .../KokkosBatched_Test_Trsm_Host.hpp | 204 +- .../CG/Functor_TestBatchedTeamVectorCG_1.hpp | 35 +- .../CG/Functor_TestBatchedTeamVectorCG_2.hpp | 58 +- .../CG/Functor_TestBatchedTeamVectorCG_3.hpp | 57 +- .../sparse/CG/KokkosBatched_Test_CG.cpp | 172 +- .../Functor_TestBatchedTeamVectorGMRES_1.hpp | 56 +- .../Functor_TestBatchedTeamVectorGMRES_2.hpp | 99 +- .../Functor_TestBatchedTeamVectorGMRES_3.hpp | 100 +- .../sparse/GMRES/KokkosBatched_Test_GMRES.cpp | 290 +- .../KokkosBatched_Test_Sparse_Helper.hpp | 30 +- .../sparse/SPMV/KokkosBatched_SPMV_View.hpp | 212 +- .../sparse/SPMV/KokkosBatched_Test_SPMV.cpp | 247 +- .../KokkosBatched_Test_cusolverDn.cpp | 73 +- .../KokkosBatched_Test_cusolverSp.cpp | 168 +- perf_test/blas/KokkosBlas_blas1.cpp | 34 +- perf_test/blas/KokkosBlas_blas1_MV.cpp | 69 +- .../blas1/KokkosBlas_dot_mv_perf_test.cpp | 28 +- .../KokkosBlas_dot_mv_perf_test_benchmark.cpp | 18 +- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 22 +- .../blas/blas1/KokkosBlas_dot_perf_test.hpp | 6 +- .../KokkosBlas_dot_perf_test_benchmark.cpp | 15 +- .../KokkosBlas_dot_tracked_perf_test.cpp | 7 +- perf_test/blas/blas1/KokkosBlas_perf_test.cpp | 54 +- .../blas1/KokkosBlas_team_dot_perf_test.cpp | 29 +- .../blas1/KokkosBlas_team_dot_perf_test.hpp | 28 +- ...okkosBlas_team_dot_perf_test_benchmark.cpp | 25 +- .../KokkosBlas_team_dot_tracked_perf_test.cpp | 15 +- perf_test/blas/blas1/tracked_testing.hpp | 9 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 43 +- .../blas/blas2/KokkosBlas2_gemv_perf_test.hpp | 15 +- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 51 +- .../KokkosBlas2_gemv_tracked_perf_test.cpp | 4 +- .../KokkosBlas2_ger_perf_test_benchmark.cpp | 139 +- perf_test/blas/blas2/tracked_testing.hpp | 6 +- perf_test/blas/blas3/KokkosBlas3_common.hpp | 13 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1340 ++--- .../KokkosBlas3_gemm_standalone_perf_test.cpp | 24 +- ...s3_gemm_standalone_perf_test_benchmark.cpp | 70 +- .../KokkosBlas3_gemm_tracked_perf_test.cpp | 15 +- .../KokkosBlas3_gemm_tracked_perf_test.hpp | 18 +- .../blas/blas3/KokkosBlas3_perf_test.cpp | 92 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 308 +- .../blas/blas3/KokkosBlas_trtri_perf_test.hpp | 186 +- perf_test/blas/blas3/tracked_testing.hpp | 9 +- perf_test/graph/KokkosGraph_color.cpp | 260 +- perf_test/graph/KokkosGraph_color_d2.cpp | 288 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 61 +- perf_test/graph/KokkosGraph_triangle.cpp | 120 +- .../lapack/KokkosLapack_SVD_benchmark.cpp | 26 +- perf_test/ode/KokkosODE_BDF.cpp | 64 +- perf_test/ode/KokkosODE_RK.cpp | 113 +- perf_test/performance/performance_example.cpp | 30 +- .../performance/performance_validate.cpp | 166 +- perf_test/sparse/KokkosSparse_block_pcg.cpp | 194 +- perf_test/sparse/KokkosSparse_gs.cpp | 124 +- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 38 +- perf_test/sparse/KokkosSparse_mdf.cpp | 69 +- perf_test/sparse/KokkosSparse_par_ilut.cpp | 144 +- perf_test/sparse/KokkosSparse_pcg.cpp | 70 +- perf_test/sparse/KokkosSparse_pcg.hpp | 136 +- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 258 +- perf_test/sparse/KokkosSparse_spadd.cpp | 255 +- perf_test/sparse/KokkosSparse_spgemm.cpp | 223 +- .../sparse/KokkosSparse_spgemm_jacobi.cpp | 63 +- perf_test/sparse/KokkosSparse_spiluk.cpp | 205 +- perf_test/sparse/KokkosSparse_spmv.cpp | 51 +- .../sparse/KokkosSparse_spmv_benchmark.cpp | 59 +- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 169 +- .../KokkosSparse_spmv_bsr_benchmark.cpp | 123 +- perf_test/sparse/KokkosSparse_spmv_merge.cpp | 72 +- perf_test/sparse/KokkosSparse_spmv_struct.cpp | 99 +- .../KokkosSparse_spmv_struct_tuning.cpp | 280 +- perf_test/sparse/KokkosSparse_spmv_test.cpp | 29 +- perf_test/sparse/KokkosSparse_spmv_test.hpp | 78 +- perf_test/sparse/KokkosSparse_sptrsv.cpp | 273 +- perf_test/sparse/KokkosSparse_sptrsv_aux.hpp | 296 +- .../sparse/KokkosSparse_sptrsv_cholmod.cpp | 175 +- .../sparse/KokkosSparse_sptrsv_superlu.cpp | 299 +- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 114 +- perf_test/sparse/spmv/CuSparse_SPMV.hpp | 33 +- perf_test/sparse/spmv/KokkosKernels_SPMV.hpp | 24 +- .../sparse/spmv/KokkosKernels_spmv_data.hpp | 6 +- perf_test/sparse/spmv/Kokkos_SPMV.hpp | 112 +- .../sparse/spmv/Kokkos_SPMV_Inspector.hpp | 56 +- perf_test/sparse/spmv/MKL_SPMV.hpp | 22 +- perf_test/sparse/spmv/OpenMPDynamic_SPMV.hpp | 3 +- .../sparse/spmv/OpenMPSmartStatic_SPMV.hpp | 17 +- perf_test/sparse/spmv/OpenMPStatic_SPMV.hpp | 3 +- perf_test/sparse/spmv/matrix_market.hpp | 61 +- perf_test/sparse/tracked_testing.hpp | 3 +- perf_test/test_crsmatrix.cpp | 177 +- perf_test/test_mv.cpp | 59 +- sparse/impl/KokkosSparse_bspgemm_impl.hpp | 92 +- sparse/impl/KokkosSparse_bspgemm_impl_def.hpp | 30 +- .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 961 ++-- sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp | 68 +- .../impl/KokkosSparse_bspgemm_impl_speed.hpp | 382 +- .../KokkosSparse_bspgemm_numeric_spec.hpp | 457 +- sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp | 24 +- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 463 +- sparse/impl/KokkosSparse_coo2crs_impl.hpp | 82 +- .../KokkosSparse_crs_detect_block_size.hpp | 14 +- sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp | 47 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 1342 ++--- .../impl/KokkosSparse_gauss_seidel_spec.hpp | 656 +-- ...kkosSparse_getDiagCopyWithOffsets_impl.hpp | 61 +- sparse/impl/KokkosSparse_gmres_impl.hpp | 115 +- sparse/impl/KokkosSparse_gmres_spec.hpp | 135 +- sparse/impl/KokkosSparse_mdf_impl.hpp | 376 +- sparse/impl/KokkosSparse_merge_matrix.hpp | 64 +- .../KokkosSparse_par_ilut_numeric_impl.hpp | 500 +- .../KokkosSparse_par_ilut_numeric_spec.hpp | 258 +- .../KokkosSparse_par_ilut_symbolic_impl.hpp | 10 +- .../KokkosSparse_par_ilut_symbolic_spec.hpp | 149 +- .../impl/KokkosSparse_partitioning_impl.hpp | 104 +- .../impl/KokkosSparse_sor_sequential_impl.hpp | 48 +- .../impl/KokkosSparse_spadd_numeric_impl.hpp | 157 +- .../impl/KokkosSparse_spadd_numeric_spec.hpp | 251 +- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 380 +- .../impl/KokkosSparse_spadd_symbolic_spec.hpp | 168 +- sparse/impl/KokkosSparse_spgemm_imp_outer.hpp | 371 +- sparse/impl/KokkosSparse_spgemm_impl.hpp | 486 +- .../impl/KokkosSparse_spgemm_impl_color.hpp | 592 +-- .../KokkosSparse_spgemm_impl_compression.hpp | 829 ++- sparse/impl/KokkosSparse_spgemm_impl_def.hpp | 140 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 1044 ++-- .../KokkosSparse_spgemm_impl_memaccess.hpp | 476 +- sparse/impl/KokkosSparse_spgemm_impl_seq.hpp | 81 +- .../impl/KokkosSparse_spgemm_impl_speed.hpp | 355 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 1823 +++---- .../KokkosSparse_spgemm_impl_triangle.hpp | 1499 +++--- ...se_spgemm_impl_triangle_no_compression.hpp | 811 ++- ...kkosSparse_spgemm_jacobi_denseacc_impl.hpp | 226 +- .../KokkosSparse_spgemm_jacobi_seq_impl.hpp | 58 +- ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 1042 ++-- .../impl/KokkosSparse_spgemm_jacobi_spec.hpp | 299 +- .../impl/KokkosSparse_spgemm_noreuse_spec.hpp | 114 +- .../impl/KokkosSparse_spgemm_numeric_spec.hpp | 256 +- .../KokkosSparse_spgemm_symbolic_spec.hpp | 178 +- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 401 +- .../impl/KokkosSparse_spiluk_numeric_spec.hpp | 327 +- .../KokkosSparse_spiluk_symbolic_impl.hpp | 106 +- .../KokkosSparse_spiluk_symbolic_spec.hpp | 203 +- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 956 ++-- .../KokkosSparse_spmv_bsrmatrix_impl_v42.hpp | 27 +- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 350 +- sparse/impl/KokkosSparse_spmv_impl.hpp | 778 ++- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 178 +- sparse/impl/KokkosSparse_spmv_impl_omp.hpp | 19 +- sparse/impl/KokkosSparse_spmv_spec.hpp | 295 +- sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 1016 ++-- sparse/impl/KokkosSparse_spmv_struct_spec.hpp | 257 +- sparse/impl/KokkosSparse_spmv_team_impl.hpp | 132 +- sparse/impl/KokkosSparse_spmv_team_spec.hpp | 48 +- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 388 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 1117 ++-- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 258 +- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 210 +- .../KokkosSparse_sptrsv_symbolic_spec.hpp | 102 +- sparse/impl/KokkosSparse_trsv_impl.hpp | 110 +- sparse/impl/KokkosSparse_trsv_spec.hpp | 184 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 369 +- sparse/src/KokkosKernels_Controls.hpp | 38 +- sparse/src/KokkosKernels_Handle.hpp | 390 +- sparse/src/KokkosSparse_BsrMatrix.hpp | 333 +- sparse/src/KokkosSparse_CcsMatrix.hpp | 67 +- sparse/src/KokkosSparse_CooMatrix.hpp | 33 +- sparse/src/KokkosSparse_CrsMatrix.hpp | 245 +- sparse/src/KokkosSparse_IOUtils.hpp | 426 +- sparse/src/KokkosSparse_LUPrec.hpp | 68 +- sparse/src/KokkosSparse_MatrixPrec.hpp | 8 +- sparse/src/KokkosSparse_OrdinalTraits.hpp | 16 +- sparse/src/KokkosSparse_Preconditioner.hpp | 8 +- sparse/src/KokkosSparse_SortCrs.hpp | 432 +- sparse/src/KokkosSparse_Utils.hpp | 1390 ++--- sparse/src/KokkosSparse_Utils_cusparse.hpp | 55 +- sparse/src/KokkosSparse_Utils_mkl.hpp | 139 +- sparse/src/KokkosSparse_Utils_rocsparse.hpp | 61 +- sparse/src/KokkosSparse_ccs2crs.hpp | 69 +- sparse/src/KokkosSparse_coo2crs.hpp | 36 +- sparse/src/KokkosSparse_crs2ccs.hpp | 69 +- sparse/src/KokkosSparse_crs2coo.hpp | 82 +- sparse/src/KokkosSparse_findRelOffset.hpp | 12 +- sparse/src/KokkosSparse_gauss_seidel.hpp | 756 ++- .../src/KokkosSparse_gauss_seidel_handle.hpp | 401 +- sparse/src/KokkosSparse_getDiagCopy.hpp | 35 +- sparse/src/KokkosSparse_gmres.hpp | 122 +- sparse/src/KokkosSparse_gmres_handle.hpp | 26 +- sparse/src/KokkosSparse_mdf.hpp | 126 +- sparse/src/KokkosSparse_mdf_handle.hpp | 18 +- sparse/src/KokkosSparse_par_ilut.hpp | 468 +- sparse/src/KokkosSparse_par_ilut_handle.hpp | 72 +- sparse/src/KokkosSparse_spadd.hpp | 275 +- sparse/src/KokkosSparse_spadd_handle.hpp | 18 +- sparse/src/KokkosSparse_spgemm.hpp | 120 +- sparse/src/KokkosSparse_spgemm_handle.hpp | 264 +- sparse/src/KokkosSparse_spgemm_jacobi.hpp | 141 +- sparse/src/KokkosSparse_spgemm_numeric.hpp | 157 +- sparse/src/KokkosSparse_spgemm_symbolic.hpp | 94 +- sparse/src/KokkosSparse_spiluk.hpp | 1005 ++-- sparse/src/KokkosSparse_spiluk_handle.hpp | 69 +- sparse/src/KokkosSparse_spmv.hpp | 674 +-- sparse/src/KokkosSparse_spmv_deprecated.hpp | 229 +- sparse/src/KokkosSparse_spmv_handle.hpp | 113 +- sparse/src/KokkosSparse_spmv_team.hpp | 90 +- sparse/src/KokkosSparse_sptrsv.hpp | 534 +- sparse/src/KokkosSparse_sptrsv_cholmod.hpp | 84 +- sparse/src/KokkosSparse_sptrsv_handle.hpp | 294 +- sparse/src/KokkosSparse_sptrsv_superlu.hpp | 94 +- sparse/src/KokkosSparse_sptrsv_supernode.hpp | 547 +- sparse/src/KokkosSparse_trsv.hpp | 70 +- sparse/tpls/KokkosKernels_tpl_handles_def.hpp | 4 +- ...kkosSparse_gauss_seidel_tpl_spec_avail.hpp | 7 +- .../KokkosSparse_gmres_tpl_spec_avail.hpp | 3 +- ...Sparse_par_ilut_numeric_tpl_spec_avail.hpp | 6 +- ...parse_par_ilut_symbolic_tpl_spec_avail.hpp | 3 +- ...kkosSparse_spadd_numeric_tpl_spec_decl.hpp | 420 +- ...kosSparse_spadd_symbolic_tpl_spec_decl.hpp | 345 +- .../KokkosSparse_spadd_tpl_spec_avail.hpp | 168 +- ...kosSparse_spgemm_jacobi_tpl_spec_avail.hpp | 7 +- ...osSparse_spgemm_noreuse_tpl_spec_avail.hpp | 45 +- ...kosSparse_spgemm_noreuse_tpl_spec_decl.hpp | 199 +- ...osSparse_spgemm_numeric_tpl_spec_avail.hpp | 180 +- ...kosSparse_spgemm_numeric_tpl_spec_decl.hpp | 660 +-- ...sSparse_spgemm_symbolic_tpl_spec_avail.hpp | 118 +- ...osSparse_spgemm_symbolic_tpl_spec_decl.hpp | 606 +-- ...osSparse_spiluk_numeric_tpl_spec_avail.hpp | 7 +- ...sSparse_spiluk_symbolic_tpl_spec_avail.hpp | 5 +- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 334 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 1017 ++-- .../KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 165 +- .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 399 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 280 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 725 +-- ...kkosSparse_sptrsv_solve_tpl_spec_avail.hpp | 4 +- .../tpls/KokkosSparse_trsv_tpl_spec_avail.hpp | 3 +- sparse/unit_test/Test_Sparse_BsrMatrix.hpp | 86 +- sparse/unit_test/Test_Sparse_Controls.hpp | 3 +- sparse/unit_test/Test_Sparse_CrsMatrix.hpp | 92 +- sparse/unit_test/Test_Sparse_IOUtils.hpp | 69 +- sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 45 +- sparse/unit_test/Test_Sparse_SortCrs.hpp | 278 +- .../Test_Sparse_TestUtils_RandCsMat.hpp | 9 +- sparse/unit_test/Test_Sparse_Transpose.hpp | 180 +- sparse/unit_test/Test_Sparse_Utils.hpp | 47 +- .../Test_Sparse_block_gauss_seidel.hpp | 190 +- sparse/unit_test/Test_Sparse_bspgemm.hpp | 116 +- sparse/unit_test/Test_Sparse_ccs2crs.hpp | 51 +- sparse/unit_test/Test_Sparse_coo2crs.hpp | 111 +- sparse/unit_test/Test_Sparse_crs2ccs.hpp | 51 +- sparse/unit_test/Test_Sparse_crs2coo.hpp | 42 +- sparse/unit_test/Test_Sparse_csc2csr.hpp | 37 +- .../Test_Sparse_extractCrsDiagonalBlocks.hpp | 39 +- .../unit_test/Test_Sparse_findRelOffset.hpp | 98 +- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 576 +-- sparse/unit_test/Test_Sparse_gmres.hpp | 41 +- sparse/unit_test/Test_Sparse_mdf.hpp | 133 +- sparse/unit_test/Test_Sparse_par_ilut.hpp | 125 +- .../Test_Sparse_removeCrsMatrixZeros.hpp | 123 +- .../unit_test/Test_Sparse_replaceSumInto.hpp | 66 +- .../Test_Sparse_replaceSumIntoLonger.hpp | 135 +- sparse/unit_test/Test_Sparse_rocsparse.hpp | 7 +- sparse/unit_test/Test_Sparse_spadd.hpp | 147 +- sparse/unit_test/Test_Sparse_spgemm.hpp | 258 +- .../unit_test/Test_Sparse_spgemm_jacobi.hpp | 108 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 292 +- sparse/unit_test/Test_Sparse_spmv.hpp | 607 +-- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 227 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 101 +- sparse/unit_test/Test_Sparse_trsv.hpp | 81 +- sparse/unit_test/Test_vector_fixtures.hpp | 40 +- sparse/unit_test/matrixIssue402.hpp | 4476 +++++++---------- test_common/KokkosKernels_MatrixConverter.cpp | 48 +- test_common/KokkosKernels_MyCRSMatrix.hpp | 80 +- test_common/KokkosKernels_TestUtils.hpp | 294 +- .../KokkosKernels_Test_Structured_Matrix.hpp | 423 +- ...KokkosKernels_WriteBinaryFromBinSrcDst.cpp | 41 +- test_common/Kokkos_Performance.hpp | 138 +- .../Test_Common_Test_All_Type_Combos.hpp | 51 +- test_common/Test_Cuda.hpp | 6 +- test_common/Test_HIP.hpp | 8 +- 873 files changed, 51793 insertions(+), 85746 deletions(-) create mode 100644 .git-blame-ignore-revs diff --git a/.clang-format b/.clang-format index db5f94fa2e..ca42ad54e7 100644 --- a/.clang-format +++ b/.clang-format @@ -4,3 +4,4 @@ SortIncludes: false AlignConsecutiveAssignments: true AllowShortCaseLabelsOnASingleLine: true AllowShortIfStatementsOnASingleLine: true +ColumnLimit: 120 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..07973335d9 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,17 @@ +# +# Add formatting and other uninteresting commits here and +# 'git blame $file' will skip them IF +# A) you have it configured to do so globally (dangerous, this will break git blame on repos that don't have a .git-blame-ignore-revs file. +# git config --global blame.ignoreRevsFile .git-blame-ignore-revs +# OR +# B) you have it configured to do so locally (tedious, you have to remember to do this in every KK clone) +# git config blame.ignoreRevsFile .git-blame-ignore-revs +# OR +# C) you have an aliased blame command for KokkosKernels: +# git config --global alias.kkblame 'blame --ignore-revs-file=.git-blame-ignore-revs' +# NOTE: this implies you run 'git kkblame $file' +# OR +# D) you explicitly tell blame to skip them +# git blame --ignore-revs-file=.git-blame-ignore-revs $file +# +# List skips here: diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 08b541587f..2819fd1554 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -11,12 +11,12 @@ permissions: jobs: clang-format-check: - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Install Dependencies - run: sudo apt install clang-format-8 + run: sudo apt install clang-format-16 - name: check run: | @@ -26,7 +26,7 @@ jobs: # For every file changed, apply clang-format for file in $(git diff --name-only origin/$GITHUB_BASE_REF | egrep '.*\.cpp$|.*\.hpp$|.*\.h$'); do if [ -e $file ]; then - clang-format-8 -i -style=file $file + clang-format-16 -i -style=file $file git add $file fi done diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index fc14bd5a19..8a1cb0e01b 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -48,7 +48,7 @@ #define __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ 1 #define __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ 1 #include "mkl.h" -//#include "mkl_types.h" +// #include "mkl_types.h" #endif #endif @@ -71,12 +71,11 @@ struct is_vector : public std::false_type {}; template struct is_same_mag_type { - static const bool is_specialized = (Kokkos::ArithTraits::is_specialized && - Kokkos::ArithTraits::is_specialized); + static const bool is_specialized = + (Kokkos::ArithTraits::is_specialized && Kokkos::ArithTraits::is_specialized); static const bool is_mag_type_same = - std::is_same::mag_type, - typename Kokkos::ArithTraits::mag_type>::value; + std::is_same::mag_type, typename Kokkos::ArithTraits::mag_type>::value; static const bool value = is_specialized && is_mag_type_same; }; @@ -87,42 +86,36 @@ using std::min; // view manipulation template -using MemoryTraits = Kokkos::MemoryTraits; template -using UnmanagedViewType = Kokkos::View< - typename ViewType::data_type, typename ViewType::array_layout, - typename ViewType::device_type, - MemoryTraits >; +using UnmanagedViewType = + Kokkos::View >; template -using ConstViewType = Kokkos::View< - typename ViewType::const_data_type, typename ViewType::array_layout, - typename ViewType::device_type, typename ViewType::memory_traits>; +using ConstViewType = Kokkos::View; template using ConstUnmanagedViewType = ConstViewType >; template -using ScratchViewType = Kokkos::View< - typename ViewType::data_type, typename ViewType::array_layout, - typename ViewType::execution_space::scratch_memory_space, - MemoryTraits >; +using ScratchViewType = Kokkos::View >; // helper for vector type template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, size_t>::type - adjustDimension(const size_t &m) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, size_t>::type adjustDimension( + const size_t &m) { return m; } template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, size_t>::type - adjustDimension(const size_t &m) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, size_t>::type adjustDimension( + const size_t &m) { return (m / T::vector_length + (m % T::vector_length > 0)); } @@ -132,9 +125,7 @@ struct Flush { // flush a large host buffer Kokkos::View _buf; - Flush() : _buf("Flush::buf", BufSize / sizeof(double)) { - Kokkos::deep_copy(_buf, 1); - } + Flush() : _buf("Flush::buf", BufSize / sizeof(double)) { Kokkos::deep_copy(_buf, 1); } KOKKOS_INLINE_FUNCTION void init(value_type &update) { update = 0; } @@ -147,9 +138,7 @@ struct Flush { void run() { double sum = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, - sum); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, sum); SpaceType().fence(); FILE *fp = fopen("/dev/null", "w"); fprintf(fp, "%f\n", sum); @@ -161,9 +150,7 @@ template struct Random; template -struct Random::value || - std::is_same::value, - T>::type> { +struct Random::value || std::is_same::value, T>::type> { Random(const unsigned int seed = 0) { srand(seed); } T value() { const auto val = (rand() / ((T)RAND_MAX) - 0.5) * 2.0; @@ -172,18 +159,16 @@ struct Random::value || }; template -struct Random >::value || - std::is_same >::value || - std::is_same >::value || - std::is_same >::value, - T>::type> { +struct Random< + T, typename std::enable_if< + std::is_same >::value || std::is_same >::value || + std::is_same >::value || std::is_same >::value, + T>::type> { Random(const unsigned int seed = 0) { srand(seed); } T value() { const auto rval = (rand() / ((double)RAND_MAX) - 0.5) * 2.0; const auto ival = (rand() / ((double)RAND_MAX) - 0.5) * 2.0; - return T(rval > 0 ? rval + 1.0e-3 : rval - 1.0e-3, - ival > 0 ? ival + 1.0e-3 : ival - 1.0e-3); + return T(rval > 0 ? rval + 1.0e-3 : rval - 1.0e-3, ival > 0 ? ival + 1.0e-3 : ival - 1.0e-3); } }; @@ -199,23 +184,18 @@ struct Timer { const double t = _clock.seconds(); std::string label = _label; label.resize(24); - std::cout << "KokkosKernels::Timer:: " << std::setw(26) << label - << std::setw(15) << std::scientific << t << " [sec] " - << std::endl; + std::cout << "KokkosKernels::Timer:: " << std::setw(26) << label << std::setw(15) << std::scientific << t + << " [sec] " << std::endl; } }; // Implicit vectorization template struct SIMD { - static_assert(std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same >::value || - std::is_same >::value || - std::is_same >::value || - std::is_same >::value || + static_assert(std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same >::value || std::is_same >::value || + std::is_same >::value || std::is_same >::value || std::is_same::value || std::is_same::value, "KokkosKernels:: Invalid SIMD<> type."); @@ -225,10 +205,8 @@ struct SIMD { // Intel AVX instruction device (explicit vectorization) template struct AVX { - static_assert(std::is_same::value || - std::is_same::value || - std::is_same >::value || - std::is_same >::value, + static_assert(std::is_same::value || std::is_same::value || + std::is_same >::value || std::is_same >::value, "KokkosKernels:: Invalid AVX<> type."); using value_type = T; }; @@ -304,17 +282,15 @@ using KokkosBlas::Mode; struct Util { template - KOKKOS_INLINE_FUNCTION static void packColMajor( - ValueType *KOKKOS_RESTRICT A, const int m, const int n, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_INLINE_FUNCTION static void packColMajor(ValueType *KOKKOS_RESTRICT A, const int m, const int n, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { for (int j = 0; j < n; ++j) for (int i = 0; i < m; ++i) A[i + j * m] = B[i * bs0 + j * bs1]; } template - KOKKOS_INLINE_FUNCTION static void packRowMajor( - ValueType *KOKKOS_RESTRICT A, const int m, const int n, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_INLINE_FUNCTION static void packRowMajor(ValueType *KOKKOS_RESTRICT A, const int m, const int n, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) A[i * n + j] = B[i * bs0 + j * bs1]; } @@ -366,8 +342,7 @@ struct Partition1x3 { ValueType *A0, *A1, *A2; KOKKOS_INLINE_FUNCTION - Partition1x3(const int arg_as1) - : as1(arg_as1), A0(NULL), A1(NULL), A2(NULL) {} + Partition1x3(const int arg_as1) : as1(arg_as1), A0(NULL), A1(NULL), A2(NULL) {} KOKKOS_INLINE_FUNCTION void partWithAL(const Partition1x2 &part, const int mA1) { @@ -403,9 +378,7 @@ struct Partition2x1 { } KOKKOS_INLINE_FUNCTION - void partWithAB(ValueType *A, const int mA, const int mAB) { - partWithAT(A, mA, mA - mAB); - } + void partWithAB(ValueType *A, const int mA, const int mAB) { partWithAT(A, mA, mA - mAB); } // A0 // A1 is merged into AT @@ -430,8 +403,7 @@ struct Partition3x1 { /* */ *A2; KOKKOS_INLINE_FUNCTION - Partition3x1(const int arg_as0) - : as0(arg_as0), A0(NULL), A1(NULL), A2(NULL) {} + Partition3x1(const int arg_as0) : as0(arg_as0), A0(NULL), A1(NULL), A2(NULL) {} KOKKOS_INLINE_FUNCTION void partWithAB(const Partition2x1 &part, const int mA1) { @@ -460,16 +432,10 @@ struct Partition2x2 { KOKKOS_INLINE_FUNCTION Partition2x2(const int arg_as0, const int arg_as1) - : as0(arg_as0), - as1(arg_as1), - ATL(NULL), - ATR(NULL), - ABL(NULL), - ABR(NULL) {} + : as0(arg_as0), as1(arg_as1), ATL(NULL), ATR(NULL), ABL(NULL), ABR(NULL) {} KOKKOS_INLINE_FUNCTION - void partWithATL(ValueType *A, const int /* mA */, const int /* nA */, - const int mATL, const int nATL) { + void partWithATL(ValueType *A, const int /* mA */, const int /* nA */, const int mATL, const int nATL) { ATL = A; ATR = ATL + nATL * as1; ABL = ATL + mATL * as0; @@ -477,8 +443,7 @@ struct Partition2x2 { } KOKKOS_INLINE_FUNCTION - void partWithABR(ValueType *A, const int mA, const int nA, const int mABR, - const int nABR) { + void partWithABR(ValueType *A, const int mA, const int nA, const int mABR, const int nABR) { partWithATL(A, mA, nA, mA - mABR, nA - nABR); } @@ -523,8 +488,7 @@ struct Partition3x3 { A22(NULL) {} KOKKOS_INLINE_FUNCTION - void partWithABR(const Partition2x2 &part, const int mA11, - const int nA11) { + void partWithABR(const Partition2x2 &part, const int mA11, const int nA11) { A00 = part.ATL; A01 = part.ATR; A02 = part.ATR + nA11 * as1; @@ -537,8 +501,7 @@ struct Partition3x3 { } KOKKOS_INLINE_FUNCTION - void partWithATL(const Partition2x2 &part, const int mA11, - const int nA11) { + void partWithATL(const Partition2x2 &part, const int mA11, const int nA11) { A00 = part.ATL; A01 = part.ATR - nA11 * as1; A02 = part.ATR; @@ -552,94 +515,74 @@ struct Partition3x3 { }; template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, - void>::type - getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, - const OrdinalType numMatrices, OrdinalType &iRow, - OrdinalType &iMatrix) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, void>::type getIndices( + const OrdinalType iTemp, const OrdinalType /*numRows*/, const OrdinalType numMatrices, OrdinalType &iRow, + OrdinalType &iMatrix) { iRow = iTemp / numMatrices; iMatrix = iTemp % numMatrices; } template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, - void>::type - getIndices(const OrdinalType iTemp, const OrdinalType numRows, - const OrdinalType /*numMatrices*/, OrdinalType &iRow, - OrdinalType &iMatrix) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, void>::type getIndices( + const OrdinalType iTemp, const OrdinalType numRows, const OrdinalType /*numMatrices*/, OrdinalType &iRow, + OrdinalType &iMatrix) { iRow = iTemp % numRows; iMatrix = iTemp / numRows; } template -KOKKOS_INLINE_FUNCTION - typename std::enable_if::value, - void>::type - getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, - const OrdinalType numMatrices, OrdinalType &iRow, - OrdinalType &iMatrix) { +KOKKOS_INLINE_FUNCTION typename std::enable_if::value, void>::type +getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, const OrdinalType numMatrices, OrdinalType &iRow, + OrdinalType &iMatrix) { iRow = iTemp / numMatrices; iMatrix = iTemp % numMatrices; } template KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const int *order) { - constexpr int rank = 2; - const int dim[] = {v.extent_int(1), v.extent_int(0)}; - using view_value_type = typename ViewType::value_type; - using execution_space_type = typename ViewType::execution_space; - using view_type = Kokkos::View; - Kokkos::LayoutStride stride = - Kokkos::LayoutStride::order_dimensions(rank, order, dim); + constexpr int rank = 2; + const int dim[] = {v.extent_int(1), v.extent_int(0)}; + using view_value_type = typename ViewType::value_type; + using execution_space_type = typename ViewType::execution_space; + using view_type = Kokkos::View; + Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions(rank, order, dim); return view_type(v.data(), stride); } template -KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, - const BatchLayout::Left &) { +KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const BatchLayout::Left &) { const int order[] = {0, 1}; // v is LayoutRight return transpose_2d_view(v, order); } template -KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, - const BatchLayout::Right &) { +KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const BatchLayout::Right &) { const int order[] = {1, 0}; // v is LayoutLeft return transpose_2d_view(v, order); } ///// subview_wrapper overloads for handling 3-rank BatchLayout::Left views template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, const BatchLayout::Left &) { return Kokkos::subview(v, i1, i2, i3); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, - const BatchLayout::Left &layout_tag, - const Trans::NoTranspose) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Left &layout_tag, const Trans::NoTranspose) { return subview_wrapper(v, i1, i2, i3, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::ALL_t i2, Kokkos::ALL_t i3, - const BatchLayout::Left &layout_tag, - const Trans::Transpose) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, + const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return transpose_2d_view(sv_nt, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, - const BatchLayout::Left &layout_tag, - const Trans::Transpose) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return sv_nt; @@ -647,29 +590,25 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, //// subview_wrapper overloads for handling 3-rank BatchLayout::Right views template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - IdxType2 i2, IdxType3 i3, +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, const BatchLayout::Right &) { return Kokkos::subview(v, i2, i3, i1); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, - const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { return subview_wrapper(v, i1, i2, i3, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, - const BatchLayout::Right &layout_tag, const Trans::Transpose &) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, + const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return transpose_2d_view(sv_nt, layout_tag); } template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, - const BatchLayout::Right &layout_tag, const Trans::Transpose &) { +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, + const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); return sv_nt; @@ -686,71 +625,48 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( * otherwise, the last element of v. */ template -KOKKOS_INLINE_FUNCTION ViewValueType -access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { - return v(KOKKOSKERNELS_MACRO_MIN(m, v.extent_int(0) - 1), - KOKKOSKERNELS_MACRO_MIN(n, v.extent_int(1) - 1)); +KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { + return v(KOKKOSKERNELS_MACRO_MIN(m, v.extent_int(0) - 1), KOKKOSKERNELS_MACRO_MIN(n, v.extent_int(1) - 1)); } template -KOKKOS_INLINE_FUNCTION ViewValueType -access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::No &) { +KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::No &) { return v(m, n); } template -KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, - ScalarType alpha, - const AlphaTag::Yes &) { +KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, ScalarType alpha, const AlphaTag::Yes &) { return reg_c * alpha; } template -KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, - ScalarType /*alpha*/, - const AlphaTag::No &) { +KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, ScalarType /*alpha*/, const AlphaTag::No &) { return reg_c; } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - ScalarType alpha, ScalarType beta, - const ArgAlphaFmaTag &alpha_tag, +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + ScalarType beta, const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::Yes &) { - if (m < v.extent_int(0) && n < v.extent_int(1)) - v(m, n) = fma_alpha(reg_c, alpha, alpha_tag) + v(m, n) * beta; + if (m < v.extent_int(0) && n < v.extent_int(1)) v(m, n) = fma_alpha(reg_c, alpha, alpha_tag) + v(m, n) * beta; } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - ScalarType alpha, ScalarType beta, - const ArgAlphaFmaTag &alpha_tag, +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + ScalarType beta, const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::No &) { v(m, n) = fma_alpha(reg_c, alpha, alpha_tag) + v(m, n) * beta; } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - ScalarType alpha, - const ArgAlphaFmaTag &alpha_tag, - const BoundsCheck::Yes &) { - if (m < v.extent_int(0) && n < v.extent_int(1)) - v(m, n) = fma_alpha(reg_c, alpha, alpha_tag); +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::Yes &) { + if (m < v.extent_int(0) && n < v.extent_int(1)) v(m, n) = fma_alpha(reg_c, alpha, alpha_tag); } -template -KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, - ViewValueType reg_c, - ScalarType alpha, - const ArgAlphaFmaTag &alpha_tag, - const BoundsCheck::No &) { +template +KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, ViewValueType reg_c, ScalarType alpha, + const ArgAlphaFmaTag &alpha_tag, const BoundsCheck::No &) { v(m, n) = fma_alpha(reg_c, alpha, alpha_tag); } diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp index 252c78d5c5..d89a82ae2c 100644 --- a/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp @@ -28,11 +28,9 @@ namespace KokkosBatched { /// =========== template -KOKKOS_INLINE_FUNCTION int SerialAddRadial::invoke(const ScalarType tiny, - const AViewType &A) { - return SerialAddRadialInternal::invoke( - (A.extent(0) < A.extent(1) ? A.extent(0) : A.extent(1)), tiny, A.data(), - (A.stride_0() + A.stride_1())); +KOKKOS_INLINE_FUNCTION int SerialAddRadial::invoke(const ScalarType tiny, const AViewType &A) { + return SerialAddRadialInternal::invoke((A.extent(0) < A.extent(1) ? A.extent(0) : A.extent(1)), tiny, A.data(), + (A.stride_0() + A.stride_1())); } /// @@ -41,11 +39,10 @@ KOKKOS_INLINE_FUNCTION int SerialAddRadial::invoke(const ScalarType tiny, template template -KOKKOS_INLINE_FUNCTION int TeamAddRadial::invoke( - const MemberType &member, const ScalarType tiny, const AViewType &A) { - return TeamAddRadialInternal::invoke( - member, (A.extent(0) < A.extent(1) ? A.extent(0) : A.extent(1)), tiny, - A.data(), (A.stride_0() + A.stride_1())); +KOKKOS_INLINE_FUNCTION int TeamAddRadial::invoke(const MemberType &member, const ScalarType tiny, + const AViewType &A) { + return TeamAddRadialInternal::invoke(member, (A.extent(0) < A.extent(1) ? A.extent(0) : A.extent(1)), tiny, A.data(), + (A.stride_0() + A.stride_1())); } } // end namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp index 24ecafe0a0..634879530e 100644 --- a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp @@ -28,8 +28,7 @@ namespace KokkosBatched { struct SerialAddRadialInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType tiny, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as) { const auto abs_tiny = tiny > 0 ? tiny : -tiny; const auto minus_abs_tiny = -abs_tiny; @@ -52,10 +51,8 @@ struct SerialAddRadialInternal { /// ================== struct TeamAddRadialInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType tiny, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType tiny, + /* */ ValueType *KOKKOS_RESTRICT A, const int as) { const auto abs_tiny = tiny > 0 ? tiny : -tiny; const auto minus_abs_tiny = -abs_tiny; diff --git a/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp index cf8a946e99..2d3d2af915 100644 --- a/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp @@ -30,10 +30,9 @@ namespace KokkosBatched { /// struct SerialApplyLeftGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair G, const int n, - /* */ ValueType *a1t, const int a1ts, - /* */ ValueType *a2t, const int a2ts) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair G, const int n, + /* */ ValueType *a1t, const int a1ts, + /* */ ValueType *a2t, const int a2ts) { typedef ValueType value_type; if (n == 0) return 0; // quick return if (G.first == value_type(1) && G.second == value_type(0)) return 0; @@ -59,10 +58,9 @@ struct SerialApplyLeftGivensInternal { struct SerialApplyRightGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair G, const int m, - /* */ ValueType *a1, const int a1s, - /* */ ValueType *a2, const int a2s) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair G, const int m, + /* */ ValueType *a1, const int a1s, + /* */ ValueType *a2, const int a2s) { typedef ValueType value_type; if (m == 0) return 0; // quick return if (G.first == value_type(1) && G.second == value_type(0)) return 0; @@ -88,12 +86,11 @@ struct SerialApplyRightGivensInternal { struct SerialApplyLeftRightGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair &G12, const int &m, const int &n, - /* */ ValueType *KOKKOS_RESTRICT a1t, - /* */ ValueType *KOKKOS_RESTRICT a2t, - /* */ ValueType *KOKKOS_RESTRICT a1, - /* */ ValueType *KOKKOS_RESTRICT a2, const int &as0, const int &as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair &G12, const int &m, const int &n, + /* */ ValueType *KOKKOS_RESTRICT a1t, + /* */ ValueType *KOKKOS_RESTRICT a2t, + /* */ ValueType *KOKKOS_RESTRICT a1, + /* */ ValueType *KOKKOS_RESTRICT a2, const int &as0, const int &as1) { typedef ValueType value_type; if (G12.first == value_type(1) && G12.second == value_type(0)) return 0; if (m == 0 && n == 0) return 0; // quick return @@ -124,15 +121,14 @@ struct SerialApplyLeftRightGivensInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair &G12, - const Kokkos::pair &G13, const int &m, const int &n, - /* */ ValueType *KOKKOS_RESTRICT a1t, - /* */ ValueType *KOKKOS_RESTRICT a2t, - /* */ ValueType *KOKKOS_RESTRICT a3t, - /* */ ValueType *KOKKOS_RESTRICT a1, - /* */ ValueType *KOKKOS_RESTRICT a2, - /* */ ValueType *KOKKOS_RESTRICT a3, const int &as0, const int &as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair &G12, + const Kokkos::pair &G13, const int &m, const int &n, + /* */ ValueType *KOKKOS_RESTRICT a1t, + /* */ ValueType *KOKKOS_RESTRICT a2t, + /* */ ValueType *KOKKOS_RESTRICT a3t, + /* */ ValueType *KOKKOS_RESTRICT a1, + /* */ ValueType *KOKKOS_RESTRICT a2, + /* */ ValueType *KOKKOS_RESTRICT a3, const int &as0, const int &as1) { typedef ValueType value_type; if (m == 0 && n == 0) return 0; // quick return diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp index be720bef2e..db85d96680 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp @@ -28,27 +28,21 @@ namespace KokkosBatched { /// =========== template <> -template -KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke( - const uViewType &u2, const tauViewType &tau, const AViewType &A, - const wViewType &w) { - return SerialApplyLeftHouseholderInternal::invoke( - A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), u2.stride(0), - A.data(), A.stride(1), A.data() + A.stride(0), A.stride(0), A.stride(1), - w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke(const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return SerialApplyLeftHouseholderInternal::invoke(A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), u2.stride(0), + A.data(), A.stride(1), A.data() + A.stride(0), A.stride(0), + A.stride(1), w.data()); } template <> -template -KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke( - const uViewType &u2, const tauViewType &tau, const AViewType &A, - const wViewType &w) { - return SerialApplyRightHouseholderInternal::invoke( - A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), u2.stride(0), - A.data(), A.stride(0), A.data() + A.stride(1), A.stride(0), A.stride(1), - w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyHouseholder::invoke(const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return SerialApplyRightHouseholderInternal::invoke(A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), u2.stride(0), + A.data(), A.stride(0), A.data() + A.stride(1), A.stride(0), + A.stride(1), w.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index 611e9440b5..e129fef5a5 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -30,12 +30,10 @@ namespace KokkosBatched { /// struct SerialApplyLeftHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ValueType* tau, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType* tau, /* */ ValueType* u2, const int u2s, /* */ ValueType* a1t, const int a1ts, - /* */ ValueType* A2, const int as0, - const int as1, + /* */ ValueType* A2, const int as0, const int as1, /* */ ValueType* w1t) { typedef ValueType value_type; @@ -55,9 +53,7 @@ struct SerialApplyLeftHouseholderInternal { // w1t /= tau for (int j = 0; j < n; ++j) { value_type tmp = a1t[j * a1ts]; - for (int i = 0; i < m; ++i) - tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * - A2[i * as0 + j * as1]; + for (int i = 0; i < m; ++i) tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; w1t[j] = tmp * inv_tau; // /= (*tau); } @@ -74,12 +70,10 @@ struct SerialApplyLeftHouseholderInternal { struct SerialApplyRightHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ValueType* tau, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType* tau, /* */ ValueType* u2, const int u2s, /* */ ValueType* a1, const int a1s, - /* */ ValueType* A2, const int as0, - const int as1, + /* */ ValueType* A2, const int as0, const int as1, /* */ ValueType* w1) { typedef ValueType value_type; /// u2 n x 1 @@ -107,9 +101,7 @@ struct SerialApplyRightHouseholderInternal { // A2 -= w1 * u2' (ger with conjugate) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) - A2[i * as0 + j * as1] -= - w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp index d1dcc58d18..b322574ad0 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp @@ -29,33 +29,23 @@ namespace KokkosBatched { template struct TeamVectorApplyHouseholder { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const uViewType &u2, - const tauViewType &tau, - const AViewType &A, - const wViewType &w) { - return TeamVectorApplyLeftHouseholderInternal::invoke( - member, A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), - u2.stride(0), A.data(), A.stride(1), A.data() + A.stride(0), - A.stride(0), A.stride(1), w.data()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return TeamVectorApplyLeftHouseholderInternal::invoke(member, A.extent(0) - 1, A.extent(1), tau.data(), u2.data(), + u2.stride(0), A.data(), A.stride(1), A.data() + A.stride(0), + A.stride(0), A.stride(1), w.data()); } }; template struct TeamVectorApplyHouseholder { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const uViewType &u2, - const tauViewType &tau, - const AViewType &A, - const wViewType &w) { - return TeamVectorApplyRightHouseholderInternal::invoke( - member, A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), - u2.stride(0), A.data(), A.stride(0), A.data() + A.stride(1), - A.stride(0), A.stride(1), w.data()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const uViewType &u2, const tauViewType &tau, + const AViewType &A, const wViewType &w) { + return TeamVectorApplyRightHouseholderInternal::invoke(member, A.extent(0), A.extent(1) - 1, tau.data(), u2.data(), + u2.stride(0), A.data(), A.stride(0), A.data() + A.stride(1), + A.stride(0), A.stride(1), w.data()); } }; diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp index 2754818fbf..2474a10fe3 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp @@ -30,13 +30,10 @@ namespace KokkosBatched { /// struct TeamVectorApplyLeftHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ValueType *tau, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ValueType *tau, /* */ ValueType *u2, const int u2s, /* */ ValueType *a1t, const int a1ts, - /* */ ValueType *A2, const int as0, - const int as1, + /* */ ValueType *A2, const int as0, const int as1, /* */ ValueType *w1t) { typedef ValueType value_type; @@ -59,8 +56,7 @@ struct TeamVectorApplyLeftHouseholderInternal { Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, m), [&](const int &i, value_type &val) { - val += Kokkos::ArithTraits::conj(u2[i * u2s]) * - A2[i * as0 + j * as1]; + val += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; }, tmp); Kokkos::single(Kokkos::PerThread(member), [&]() { @@ -70,26 +66,19 @@ struct TeamVectorApplyLeftHouseholderInternal { member.team_barrier(); // a1t -= w1t (axpy) - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const int &j) { a1t[j * a1ts] -= w1t[j]; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { a1t[j * a1ts] -= w1t[j]; }); // A2 -= u2 w1t (ger) if (as0 <= as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), + [&](const int &i) { A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; - }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; }); + }); } return 0; @@ -98,13 +87,10 @@ struct TeamVectorApplyLeftHouseholderInternal { struct TeamVectorApplyRightHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ValueType *tau, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ValueType *tau, /* */ ValueType *u2, const int u2s, /* */ ValueType *a1, const int a1s, - /* */ ValueType *A2, const int as0, - const int as1, + /* */ ValueType *A2, const int as0, const int as1, /* */ ValueType *w1) { typedef ValueType value_type; /// u2 n x 1 @@ -125,10 +111,7 @@ struct TeamVectorApplyRightHouseholderInternal { value_type tmp(0); Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, n), - [&](const int &j, value_type &val) { - val += A2[i * as0 + j * as1] * u2[j * u2s]; - }, - tmp); + [&](const int &j, value_type &val) { val += A2[i * as0 + j * as1] * u2[j * u2s]; }, tmp); Kokkos::single(Kokkos::PerThread(member), [&]() { w1[i] = (tmp + a1[i * a1s]) * inv_tau; // \= (*tau); }); @@ -136,28 +119,21 @@ struct TeamVectorApplyRightHouseholderInternal { member.team_barrier(); // a1 -= w1 (axpy) - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { a1[i * a1s] -= w1[i]; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { a1[i * a1s] -= w1[i]; }); // A2 -= w1 * u2' (ger with conjugate) if (as0 <= as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= - w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - A2[i * as0 + j * as1] -= - w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); - }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + }); + }); } return 0; diff --git a/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp index afc518f43c..10455f65b6 100644 --- a/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp @@ -35,34 +35,26 @@ namespace KokkosBatched { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), as0); } else if (AViewType::rank == 2) { const int n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixForwardInternal::invoke(member, n, piv, - A.data(), as0, as1); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, n, piv, A.data(), as0, as1); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal::invoke( - member, plen, piv.data(), ps0, A.data(), as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, plen, piv.data(), ps0, A.data(), as0); } else if (AViewType::rank == 2) { // row permutation - const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixForwardInternal::invoke( - member, n, plen, piv.data(), ps0, A.data(), as0, as1); + const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, n, plen, piv.data(), ps0, A.data(), as0, as1); } return 0; } @@ -72,34 +64,26 @@ struct TeamVectorApplyPivot { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, piv, A.data(), as0); } else if (AViewType::rank == 2) { const int m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixForwardInternal::invoke(member, m, piv, - A.data(), as1, as0); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, m, piv, A.data(), as1, as0); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType &piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType &piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorForwardInternal ::invoke( - member, plen, piv.data(), A.data(), as0); + TeamVectorApplyPivotVectorForwardInternal ::invoke(member, plen, piv.data(), A.data(), as0); } else if (AViewType::rank == 2) { // column permutation - const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixForwardInternal ::invoke( - member, m, plen, piv.data(), ps, A.data(), as1, as0); + const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixForwardInternal ::invoke(member, m, plen, piv.data(), ps, A.data(), as1, as0); } return 0; } @@ -113,34 +97,26 @@ struct TeamVectorApplyPivot { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), as0); } else if (AViewType::rank == 2) { const int n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, n, piv, - A.data(), as0, as1); + TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, n, piv, A.data(), as0, as1); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal::invoke( - member, plen, piv.data(), ps0, A.data(), as0); + TeamVectorApplyPivotVectorBackwardInternal::invoke(member, plen, piv.data(), ps0, A.data(), as0); } else if (AViewType::rank == 2) { // row permutation - const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal::invoke( - member, n, plen, piv.data(), ps0, A.data(), as0, as1); + const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, n, plen, piv.data(), ps0, A.data(), as0, as1); } return 0; } @@ -150,34 +126,26 @@ struct TeamVectorApplyPivot { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A) { if (AViewType::rank == 1) { const int as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), - as0); + TeamVectorApplyPivotVectorBackwardInternal::invoke(member, piv, A.data(), as0); } else if (AViewType::rank == 2) { const int m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, m, piv, - A.data(), as1, as0); + TeamVectorApplyPivotMatrixBackwardInternal::invoke(member, m, piv, A.data(), as1, as0); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType &piv, - const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType &piv, const AViewType &A) { if (AViewType::rank == 1) { const int plen = piv.extent(0), as0 = A.stride(0); - TeamVectorApplyPivotVectorBackwardInternal ::invoke( - member, plen, piv.data(), A.data(), as0); + TeamVectorApplyPivotVectorBackwardInternal ::invoke(member, plen, piv.data(), A.data(), as0); } else if (AViewType::rank == 2) { // column permutation - const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), - as0 = A.stride(0), as1 = A.stride(1); - TeamVectorApplyPivotMatrixBackwardInternal ::invoke( - member, m, plen, piv.data(), ps, A.data(), as1, as0); + const int plen = piv.extent(0), ps = piv.stride(0), m = A.extent(0), as0 = A.stride(0), as1 = A.stride(1); + TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, m, plen, piv.data(), ps, A.data(), as1, as0); } return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp index 59548c3d26..a301382108 100644 --- a/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp @@ -31,10 +31,8 @@ namespace KokkosBatched { /// struct TeamVectorApplyPivotVectorForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { if (piv != 0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { const int idx_p = piv * as0; @@ -47,12 +45,9 @@ struct TeamVectorApplyPivotVectorForwardInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int plen, - const IntType *KOKKOS_RESTRICT p, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { for (int i = 0; i < plen; ++i) { const int piv = p[i * ps0]; @@ -71,30 +66,24 @@ struct TeamVectorApplyPivotVectorForwardInternal { /// Pivot a row struct TeamVectorApplyPivotMatrixForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (piv != 0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const int &j) { - ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; - const int idx_p = piv * as0; - const ValueType tmp = A_at_j[0]; - A_at_j[0] = A_at_j[idx_p]; - A_at_j[idx_p] = tmp; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const int idx_p = piv * as0; + const ValueType tmp = A_at_j[0]; + A_at_j[0] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + }); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int plen, - const IntType *KOKKOS_RESTRICT p, - const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int plen, + const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; for (int i = 0; i < plen; ++i) { @@ -116,10 +105,8 @@ struct TeamVectorApplyPivotMatrixForwardInternal { /// struct TeamVectorApplyPivotVectorBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { if (piv != 0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { const int idx_p = piv * as0; @@ -132,12 +119,9 @@ struct TeamVectorApplyPivotVectorBackwardInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int plen, - const IntType *KOKKOS_RESTRICT p, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { Kokkos::single(Kokkos::PerTeam(member), [&]() { for (int i = (plen - 1); i >= 0; --i) { const int piv = p[i * ps0]; @@ -156,30 +140,24 @@ struct TeamVectorApplyPivotVectorBackwardInternal { /// Pivot a row struct TeamVectorApplyPivotMatrixBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int piv, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int piv, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (piv != 0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const int &j) { - ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; - const int idx_p = piv * as0; - const ValueType tmp = A_at_j[0]; - A_at_j[0] = A_at_j[idx_p]; - A_at_j[idx_p] = tmp; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const int idx_p = piv * as0; + const ValueType tmp = A_at_j[0]; + A_at_j[0] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + }); } return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int n, const int plen, - const IntType *KOKKOS_RESTRICT p, - const int ps0, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const int plen, + const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; for (int i = (plen - 1); i >= 0; --i) { diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp index 2a7519f2dc..ba9d85350f 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp @@ -28,42 +28,30 @@ namespace KokkosBatched { /// =========== template <> -template -KOKKOS_INLINE_FUNCTION int -SerialApplyQ::invoke( - const AViewType &A, const tViewType &t, const BViewType &B, - const wViewType &w) { - return SerialApplyQ_LeftForwardInternal::invoke( - B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyQ::invoke( + const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { + return SerialApplyQ_LeftForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), + B.stride_1(), w.data()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialApplyQ::invoke( - const AViewType &A, const tViewType &t, const BViewType &B, - const wViewType &w) { - return SerialApplyQ_LeftBackwardInternal::invoke( - B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyQ::invoke( + const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { + return SerialApplyQ_LeftBackwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), + B.stride_1(), w.data()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialApplyQ::invoke( - const AViewType &A, const tViewType &t, const BViewType &B, - const wViewType &w) { - return SerialApplyQ_RightForwardInternal::invoke( - B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +template +KOKKOS_INLINE_FUNCTION int SerialApplyQ::invoke( + const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { + return SerialApplyQ_RightForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), + B.stride_1(), w.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp index e8d6905964..dbb11df747 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp @@ -32,13 +32,10 @@ namespace KokkosBatched { struct SerialApplyQ_LeftForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const int k, - /* */ ValueType *A, const int as0, - const int as1, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, - const int bs1, + /* */ ValueType *B, const int bs0, const int bs1, /* */ ValueType *w) { typedef ValueType value_type; @@ -75,9 +72,8 @@ struct SerialApplyQ_LeftForwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, - as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0, + bs1, w); /// ----------------------------------------------------- A_part2x2.mergeToABR(A_part3x3); @@ -90,13 +86,10 @@ struct SerialApplyQ_LeftForwardInternal { struct SerialApplyQ_LeftBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const int k, - /* */ ValueType *A, const int as0, - const int as1, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, - const int bs1, + /* */ ValueType *B, const int bs0, const int bs1, /* */ ValueType *w) { typedef ValueType value_type; @@ -133,9 +126,8 @@ struct SerialApplyQ_LeftBackwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, - as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0, + bs1, w); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); @@ -148,13 +140,10 @@ struct SerialApplyQ_LeftBackwardInternal { struct SerialApplyQ_RightForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const int k, - /* */ ValueType *A, const int as0, - const int as1, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, - const int bs1, + /* */ ValueType *B, const int bs0, const int bs1, /* */ ValueType *w) { typedef ValueType value_type; @@ -191,9 +180,8 @@ struct SerialApplyQ_RightForwardInternal { const int n_B2 = n - n_A0 - 1; /// ----------------------------------------------------- // right apply householder to partitioned B1 and B2 - SerialApplyRightHouseholderInternal::invoke(m, n_B2, tau, A_part3x3.A21, - as0, B_part1x3.A1, bs0, - B_part1x3.A2, bs0, bs1, w); + SerialApplyRightHouseholderInternal::invoke(m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, B_part1x3.A2, + bs0, bs1, w); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp index 7f3a695d75..d6abd61a78 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp @@ -28,53 +28,35 @@ namespace KokkosBatched { /// =============== template -struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { - return TeamVectorApplyQ_LeftForwardInternal::invoke( - member, B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +struct TeamVectorApplyQ { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { + return TeamVectorApplyQ_LeftForwardInternal::invoke(member, B.extent(0), B.extent(1), A.extent(1), A.data(), + A.stride_0(), A.stride_1(), t.data(), t.stride_0(), B.data(), + B.stride_0(), B.stride_1(), w.data()); } }; template -struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { - return TeamVectorApplyQ_LeftBackwardInternal::invoke( - member, B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +struct TeamVectorApplyQ { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { + return TeamVectorApplyQ_LeftBackwardInternal::invoke(member, B.extent(0), B.extent(1), A.extent(1), A.data(), + A.stride_0(), A.stride_1(), t.data(), t.stride_0(), B.data(), + B.stride_0(), B.stride_1(), w.data()); } }; template -struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { - return TeamVectorApplyQ_RightForwardInternal::invoke( - member, B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(), - A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(), - B.stride_1(), w.data()); +struct TeamVectorApplyQ { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { + return TeamVectorApplyQ_RightForwardInternal::invoke(member, B.extent(0), B.extent(1), A.extent(1), A.data(), + A.stride_0(), A.stride_1(), t.data(), t.stride_0(), B.data(), + B.stride_0(), B.stride_1(), w.data()); } }; diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp index 233daa8978..8fc6c8a78a 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp @@ -32,12 +32,11 @@ namespace KokkosBatched { struct TeamVectorApplyQ_LeftForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; /// Given a matrix A that includes a series of householder vectors, @@ -73,9 +72,8 @@ struct TeamVectorApplyQ_LeftForwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, + B_part3x1.A2, bs0, bs1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToABR(A_part3x3); @@ -88,12 +86,11 @@ struct TeamVectorApplyQ_LeftForwardInternal { struct TeamVectorApplyQ_LeftBackwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; /// Given a matrix A that includes a series of householder vectors, @@ -129,9 +126,8 @@ struct TeamVectorApplyQ_LeftBackwardInternal { const int m_A2 = m - m_A0 - 1; /// ----------------------------------------------------- // left apply householder to partitioned B1 and B2 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, - B_part3x1.A2, bs0, bs1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, + B_part3x1.A2, bs0, bs1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); @@ -144,12 +140,11 @@ struct TeamVectorApplyQ_LeftBackwardInternal { struct TeamVectorApplyQ_RightForwardInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; /// Given a matrix A that includes a series of householder vectors, @@ -185,9 +180,8 @@ struct TeamVectorApplyQ_RightForwardInternal { const int n_B2 = n - n_A0 - 1; /// ----------------------------------------------------- // right apply householder to partitioned B1 and B2 - TeamVectorApplyRightHouseholderInternal::invoke( - member, m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, - B_part1x3.A2, bs0, bs1, w); + TeamVectorApplyRightHouseholderInternal::invoke(member, m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, + B_part1x3.A2, bs0, bs1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index da9d607241..6d65ebc294 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -28,11 +28,9 @@ namespace KokkosBatched { /// ==================== struct SerialAxpyInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -42,10 +40,9 @@ struct SerialAxpyInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -55,17 +52,14 @@ struct SerialAxpyInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, - const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, - const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, + const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, + const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (xs0 > xs1) - for (int i = 0; i < m; ++i) - invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + for (int i = 0; i < m; ++i) invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); else - for (int j = 0; j < n; ++j) - invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + for (int j = 0; j < n; ++j) invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); return 0; } @@ -76,50 +70,38 @@ struct SerialAxpyInternal { /// ==================== struct TeamAxpyInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha * X[i * xs0]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { Y[i * ys0] += alpha * X[i * xs0]; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int& i) { Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int& i) { - SerialAxpyInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, - Y + i * ys0, ys1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { + SerialAxpyInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int& j) { - SerialAxpyInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, - Y + j * ys1, ys0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int& j) { + SerialAxpyInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + }); } // member.team_barrier(); return 0; @@ -131,45 +113,35 @@ struct TeamAxpyInternal { /// ======================== struct TeamVectorAxpyInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha * X[i * xs0]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { Y[i * ys0] += alpha * X[i * xs0]; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { - Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), + [&](const int& i) { Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; }); // member.team_barrier(); return 0; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - Y[i * ys0 + j * ys1] += alpha[i * alphas0] * X[i * xs0 + j * xs1]; - }); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + Y[i * ys0 + j * ys1] += alpha[i * alphas0] * X[i * xs0 + j * xs1]; + }); // member.team_barrier(); return 0; } @@ -180,22 +152,14 @@ struct TeamVectorAxpyInternal { /// =========== template -KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, - const XViewType& X, - const YViewType& Y) { +KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, const XViewType& X, const YViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::axpy: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -217,11 +181,10 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, // No need to check if X.extent(0)==1 in the serial case as we don't // parallelize the kernel anyway. - return SerialAxpyInternal::template invoke< - typename alphaViewType::non_const_value_type, - typename XViewType::non_const_value_type>( - X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), - X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1()); + return SerialAxpyInternal::template invoke( + X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -230,22 +193,15 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, template template -KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( - const MemberType& member, const alphaViewType& alpha, const XViewType& X, - const YViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke(const MemberType& member, const alphaViewType& alpha, + const XViewType& X, const YViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::axpy: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -265,18 +221,15 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( #endif if (X.extent(0) == 1) { - KokkosBlas::Experimental::axpy( - member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + KokkosBlas::Experimental::axpy(member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamAxpyInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename XViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), - Y.stride_1()); + return TeamAxpyInternal::template invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -285,22 +238,15 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( template template -KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( - const MemberType& member, const alphaViewType& alpha, const XViewType& X, - const YViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke(const MemberType& member, const alphaViewType& alpha, + const XViewType& X, const YViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::axpy: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -320,19 +266,15 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( #endif if (X.extent(0) == 1) { - KokkosBlas::Experimental::axpy( - member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + KokkosBlas::Experimental::axpy(member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamVectorAxpyInternal::invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename XViewType::non_const_value_type, - typename XViewType::array_layout>(member, X.extent(0), X.extent(1), - alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1()); + return TeamVectorAxpyInternal::invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index 0a8c9d456f..e11106cc24 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -29,33 +29,24 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { - return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { + return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { - return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { + return SerialCopyInternal::invoke(A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -66,24 +57,18 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( return 1; } #endif - return SerialCopyInternal::invoke(A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1(), B.data(), + return SerialCopyInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } template <> template -KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( - const AViewType &A, const BViewType &B) { +KOKKOS_INLINE_FUNCTION int SerialCopy::invoke(const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -94,8 +79,7 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( return 1; } #endif - return SerialCopyInternal::invoke(A.extent(1), A.extent(0), A.data(), - A.stride_1(), A.stride_0(), B.data(), + return SerialCopyInternal::invoke(A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } @@ -106,40 +90,28 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), - B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -147,18 +119,15 @@ struct TeamCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(0) == 1) { - return TeamCopy::invoke( - member, Kokkos::subview(A, 0, Kokkos::ALL), - Kokkos::subview(B, 0, Kokkos::ALL)); + return TeamCopy::invoke(member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); } - return TeamCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1(), B.data(), + return TeamCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -166,18 +135,12 @@ struct TeamCopy { template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -185,18 +148,15 @@ struct TeamCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(1) == 1) { - return TeamCopy::invoke( - member, Kokkos::subview(A, Kokkos::ALL, 0), - Kokkos::subview(B, Kokkos::ALL, 0)); + return TeamCopy::invoke(member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); } - return TeamCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), - A.stride_1(), A.stride_0(), B.data(), + return TeamCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -208,40 +168,28 @@ struct TeamCopy { template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), - A.stride_0(), B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { - return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), - A.stride_0(), B.data(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + return TeamVectorCopyInternal::invoke(member, A.extent(0), A.data(), A.stride_0(), B.data(), B.stride_0()); } }; template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -249,18 +197,15 @@ struct TeamVectorCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(0) == 1) { - return TeamVectorCopy::invoke( - member, Kokkos::subview(A, 0, Kokkos::ALL), - Kokkos::subview(B, 0, Kokkos::ALL)); + return TeamVectorCopy::invoke(member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); } - return TeamVectorCopyInternal::invoke(member, A.extent(0), A.extent(1), - A.data(), A.stride_0(), A.stride_1(), + return TeamVectorCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -268,18 +213,12 @@ struct TeamVectorCopy { template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::copy: BViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::copy: AViewType must have rank 2."); - static_assert(BViewType::rank == 2, - "KokkosBatched::copy: BViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, "KokkosBatched::copy: BViewType must have rank 2."); // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { @@ -287,18 +226,15 @@ struct TeamVectorCopy { "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), - (int)B.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); return 1; } #endif if (A.extent(1) == 1) { - return TeamVectorCopy::invoke( - member, Kokkos::subview(A, Kokkos::ALL, 0), - Kokkos::subview(B, Kokkos::ALL, 0)); + return TeamVectorCopy::invoke(member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); } - return TeamVectorCopyInternal::invoke(member, A.extent(1), A.extent(0), - A.data(), A.stride_1(), A.stride_0(), + return TeamVectorCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Copy_Internal.hpp b/batched/dense/impl/KokkosBatched_Copy_Internal.hpp index ca59e4f79c..004c62646a 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Internal.hpp @@ -28,9 +28,8 @@ namespace KokkosBatched { struct SerialCopyInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -39,10 +38,9 @@ struct SerialCopyInternal { return 0; } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int m, const int n, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int m, const int n, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { if (as1 < as0) for (int i = 0; i < m; ++i) invoke(n, A + i * as0, as1, B + i * bs0, bs1); else @@ -56,30 +54,23 @@ struct SerialCopyInternal { /// ================== struct TeamCopyInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), - [&](const int &i) { B[i * bs0] = A[i * as0]; }); + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { B[i * bs0] = A[i * as0]; }); // member.team_barrier(); return 0; } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { if (m >= n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - SerialCopyInternal::invoke(n, A + i * as0, as1, B + i * bs0, bs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { SerialCopyInternal::invoke(n, A + i * as0, as1, B + i * bs0, bs1); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - SerialCopyInternal::invoke(m, A + j * as1, as0, B + j * bs1, bs0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { SerialCopyInternal::invoke(m, A + j * as1, as0, B + j * bs1, bs0); }); } // member.team_barrier(); return 0; @@ -91,36 +82,27 @@ struct TeamCopyInternal { /// ======================== struct TeamVectorCopyInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { B[i * bs0] = A[i * as0]; }); + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { B[i * bs0] = A[i * as0]; }); // member.team_barrier(); return 0; } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { if (as0 > as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { - B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), - [&](const int &j) { - B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; - }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { B[i * bs0 + j * bs1] = A[i * as0 + j * as1]; }); + }); } // member.team_barrier(); return 0; diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index a0960c621b..48d1b1f1ac 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -31,10 +31,9 @@ struct SerialDotInternal { // i \in [0,m) // C = conj(A(:))*B(:) template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, - const ValueType *KOKKOS_RESTRICT B, const int bs0, - /* */ MagnitudeType *KOKKOS_RESTRICT C) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, + const ValueType *KOKKOS_RESTRICT B, const int bs0, + /* */ MagnitudeType *KOKKOS_RESTRICT C) { using ats = Kokkos::ArithTraits; C[0] = ValueType(0); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -50,13 +49,11 @@ struct SerialDotInternal { // j \in [0,n), i \in [0,m) // C(j) = conj(A(:,j))*B(:,j) template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, - const int bs0, const int bs1, - /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { - for (int j = 0; j < n; ++j) - invoke(m, A + j * as1, as0, B + j * bs1, bs0, C + j * cs); + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const int bs1, + /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { + for (int j = 0; j < n; ++j) invoke(m, A + j * as1, as0, B + j * bs1, bs0, C + j * cs); return 0; } }; @@ -69,10 +66,10 @@ struct SerialDotInternal { // C = conj(A(:))*B(:) struct TeamDotInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, const ValueType *KOKKOS_RESTRICT B, const int bs0, - /* */ MagnitudeType *KOKKOS_RESTRICT C) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + const ValueType *KOKKOS_RESTRICT B, const int bs0, + /* */ MagnitudeType *KOKKOS_RESTRICT C) { using ats = Kokkos::ArithTraits; ValueType t(0); Kokkos::parallel_reduce( @@ -89,11 +86,10 @@ struct TeamDotInternal { // j \in [0,n), i \in [0,m) // C(j) = conj(A(:,j))*B(:,j) template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, + /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { using ats = Kokkos::ArithTraits; Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { ValueType t(0); @@ -117,10 +113,10 @@ struct TeamDotInternal { // C = conj(A(:))*B(:) struct TeamVectorDotInternal { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, - const int as0, const ValueType *KOKKOS_RESTRICT B, const int bs0, - /* */ MagnitudeType *KOKKOS_RESTRICT C) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const ValueType *KOKKOS_RESTRICT A, const int as0, + const ValueType *KOKKOS_RESTRICT B, const int bs0, + /* */ MagnitudeType *KOKKOS_RESTRICT C) { using ats = Kokkos::ArithTraits; ValueType t(0); Kokkos::parallel_reduce( @@ -137,11 +133,10 @@ struct TeamVectorDotInternal { // j \in [0,n), i \in [0,m) // C(j) = conj(A(:,j))*B(:,j) template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, + /* */ MagnitudeType *KOKKOS_RESTRICT C, const int cs) { using ats = Kokkos::ArithTraits; Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { ValueType t(0); @@ -167,30 +162,21 @@ struct TeamVectorDotInternal { template <> struct SerialDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const NormViewType &dot) { + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(1) != dot.extent(0)) { @@ -202,41 +188,31 @@ struct SerialDot { return 1; } #endif - return SerialDotInternal::template invoke< - typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), dot.data(), dot.stride_0()); + return SerialDotInternal::template invoke( + X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + dot.data(), dot.stride_0()); } }; template <> struct SerialDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const NormViewType &dot) { + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != dot.extent(0)) { @@ -247,11 +223,10 @@ struct SerialDot { return 1; } #endif - return SerialDotInternal::template invoke< - typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), - Y.data(), Y.stride_1(), Y.stride_0(), dot.data(), dot.stride_0()); + return SerialDotInternal::template invoke( + X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), Y.data(), Y.stride_1(), Y.stride_0(), + dot.data(), dot.stride_0()); } }; @@ -262,31 +237,22 @@ struct SerialDot { template struct TeamDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(1) != dot.extent(0)) { @@ -300,48 +266,37 @@ struct TeamDot { #endif if (X.extent(1) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, Kokkos::ALL, 0), - Kokkos::subview(Y, Kokkos::ALL, 0)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, Kokkos::ALL, 0), Kokkos::subview(Y, Kokkos::ALL, 0)); return 0; } - return TeamDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), dot.data(), dot.stride_0()); + return TeamDotInternal::template invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + dot.data(), dot.stride_0()); } }; template struct TeamDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != dot.extent(0)) { @@ -354,17 +309,15 @@ struct TeamDot { #endif if (X.extent(0) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, 0, Kokkos::ALL), Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), - Y.data(), Y.stride_1(), Y.stride_0(), dot.data(), dot.stride_0()); + return TeamDotInternal::template invoke( + member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), Y.data(), Y.stride_1(), Y.stride_0(), + dot.data(), dot.stride_0()); } }; @@ -375,31 +328,22 @@ struct TeamDot { template struct TeamVectorDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(1) != dot.extent(0)) { @@ -413,48 +357,37 @@ struct TeamVectorDot { #endif if (X.extent(1) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, Kokkos::ALL, 0), - Kokkos::subview(Y, Kokkos::ALL, 0)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, Kokkos::ALL, 0), Kokkos::subview(Y, Kokkos::ALL, 0)); return 0; } - return TeamVectorDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), dot.data(), dot.stride_0()); + return TeamVectorDotInternal::template invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + dot.data(), dot.stride_0()); } }; template struct TeamVectorDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: XViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: YViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::rank == 1, - "KokkosBatched::dot: NormViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != dot.extent(0)) { @@ -467,17 +400,15 @@ struct TeamVectorDot { #endif if (X.extent(0) == 1) { - dot(0) = KokkosBlas::Experimental::dot( - member, Kokkos::subview(X, 0, Kokkos::ALL), - Kokkos::subview(Y, 0, Kokkos::ALL)); + dot(0) = + KokkosBlas::Experimental::dot(member, Kokkos::subview(X, 0, Kokkos::ALL), Kokkos::subview(Y, 0, Kokkos::ALL)); return 0; } - return TeamVectorDotInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename NormViewType::non_const_value_type>( - member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), - Y.data(), Y.stride_1(), Y.stride_0(), dot.data(), dot.stride_0()); + return TeamVectorDotInternal::template invoke( + member, X.extent(1), X.extent(0), X.data(), X.stride_1(), X.stride_0(), Y.data(), Y.stride_1(), Y.stride_0(), + dot.data(), dot.stride_0()); } }; diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp index 49a7184e39..8ca3b09e59 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp @@ -26,38 +26,28 @@ namespace KokkosBatched { /// /// Serial Impl /// =========== -template -KOKKOS_INLINE_FUNCTION int SerialEigendecomposition::invoke( - const AViewType &A, const EViewType &er, const EViewType &ei, - const UViewType &UL, const UViewType &UR, const WViewType &W) { +template +KOKKOS_INLINE_FUNCTION int SerialEigendecomposition::invoke(const AViewType &A, const EViewType &er, + const EViewType &ei, const UViewType &UL, + const UViewType &UR, const WViewType &W) { /// view checking const int m = A.extent(0); assert(m == int(A.extent(1)) && "Eigendecomposition: A is not square"); - assert(m == int(er.extent(0)) && - "Eigendecomposition: Length of er does not match to A's dimension"); - assert(m == int(ei.extent(0)) && - "Eigendecomposition: Length of ei does not match to A's dimension"); - assert(m == int(UL.extent(0)) && - "Eigendecomposition: Length of UL does not match to A's dimension"); - assert(m == int(UL.extent(1)) && - "Eigendecomposition: Width of UL does not match to A's dimension"); - assert(m == int(UR.extent(0)) && - "Eigendecomposition: Length of UR does not match to A's dimension"); - assert(m == int(UR.extent(1)) && - "Eigendecomposition: Width of UR does not match to A's dimension"); + assert(m == int(er.extent(0)) && "Eigendecomposition: Length of er does not match to A's dimension"); + assert(m == int(ei.extent(0)) && "Eigendecomposition: Length of ei does not match to A's dimension"); + assert(m == int(UL.extent(0)) && "Eigendecomposition: Length of UL does not match to A's dimension"); + assert(m == int(UL.extent(1)) && "Eigendecomposition: Width of UL does not match to A's dimension"); + assert(m == int(UR.extent(0)) && "Eigendecomposition: Length of UR does not match to A's dimension"); + assert(m == int(UR.extent(1)) && "Eigendecomposition: Width of UR does not match to A's dimension"); // assert(int(W.extent(0)) >= int(2*m*m+5*m) && "Eigendecomposition: workspace // size is too small"); - assert(int(W.stride(0)) == int(1) && - "Eigendecomposition: Provided workspace is not contiguous"); + assert(int(W.stride(0)) == int(1) && "Eigendecomposition: Provided workspace is not contiguous"); /// static assert A,er,ei,UL,UR,W has the same value_type /// static assert all views have the same memory space return m ? SerialEigendecompositionInternal ::invoke( - A.extent(0), A.data(), A.stride(0), A.stride(1), er.data(), - er.stride(0), ei.data(), ei.stride(0), UL.data(), UL.stride(0), - UL.stride(1), UR.data(), UR.stride(0), UR.stride(1), W.data(), - W.extent(0)) + A.extent(0), A.data(), A.stride(0), A.stride(1), er.data(), er.stride(0), ei.data(), ei.stride(0), + UL.data(), UL.stride(0), UL.stride(1), UR.data(), UR.stride(0), UR.stride(1), W.data(), W.extent(0)) : 0; } diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index c857de19c2..b1cfb6ef25 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -61,11 +61,10 @@ struct SerialEigendecompositionInternal { /// [out]w, [in]wlen /// Workspace template - KOKKOS_INLINE_FUNCTION static int device_invoke( - const int m, RealType* A, const int as0, const int as1, RealType* er, - const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, - const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, - const int wlen) { + KOKKOS_INLINE_FUNCTION static int device_invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, + const int ers, RealType* ei, const int eis, RealType* UL, + const int uls0, const int uls1, RealType* UR, const int urs0, + const int urs1, RealType* w, const int wlen) { /// until debugging is done, comment out the code /// testing happens only for TPLs on host. static_assert(false, @@ -336,14 +335,10 @@ struct SerialEigendecompositionInternal { } template - inline static int host_invoke(const int m, RealType* A, const int as0, - const int as1, RealType* er, const int ers, - RealType* ei, const int eis, RealType* UL, - const int uls0, const int uls1, RealType* UR, - const int urs0, const int urs1, RealType* w, - const int wlen) { -#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + inline static int host_invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, const int ers, + RealType* ei, const int eis, RealType* UL, const int uls0, const int uls1, RealType* UR, + const int urs0, const int urs1, RealType* w, const int wlen) { +#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) int matrix_layout(0), lda(0), uls(0), urs(0); if (as0 == 1) { assert(uls0 == 1 && "UL is not column major"); @@ -365,33 +360,29 @@ struct SerialEigendecompositionInternal { } assert(matrix_layout != 0 && "Either stride of A is not unit"); if (std::is_same::value) { - LAPACKE_sgeev(matrix_layout, 'V', 'V', m, (float*)A, lda, (float*)er, - (float*)ei, (float*)UL, uls, (float*)UR, urs); + LAPACKE_sgeev(matrix_layout, 'V', 'V', m, (float*)A, lda, (float*)er, (float*)ei, (float*)UL, uls, (float*)UR, + urs); } else if (std::is_same::value) { - LAPACKE_dgeev(matrix_layout, 'V', 'V', m, (double*)A, lda, (double*)er, - (double*)ei, (double*)UL, uls, (double*)UR, urs); + LAPACKE_dgeev(matrix_layout, 'V', 'V', m, (double*)A, lda, (double*)er, (double*)ei, (double*)UL, uls, + (double*)UR, urs); } else { // no complex is needed for this moment assert(false && "complex type is not supported"); } #else - device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, - urs1, w, wlen); + device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen); #endif return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, RealType* A, const int as0, const int as1, RealType* er, - const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, - const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, - const int wlen) { - KOKKOS_IF_ON_HOST((host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, - uls1, UR, urs0, urs1, w, wlen);)) - KOKKOS_IF_ON_DEVICE((device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, - uls0, uls1, UR, urs0, urs1, w, wlen);)) + KOKKOS_INLINE_FUNCTION static int invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, + const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, + const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, + const int wlen) { + KOKKOS_IF_ON_HOST((host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) + KOKKOS_IF_ON_DEVICE((device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) return 0; } }; diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp index a05ee11965..97f68d63de 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp @@ -28,37 +28,28 @@ namespace KokkosBatched { /// ========= template -template -KOKKOS_INLINE_FUNCTION int TeamVectorEigendecomposition::invoke( - const MemberType &member, const AViewType &A, const EViewType &er, - const EViewType &ei, const UViewType &UL, const UViewType &UR, - const WViewType &W) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorEigendecomposition::invoke(const MemberType &member, + const AViewType &A, const EViewType &er, + const EViewType &ei, const UViewType &UL, + const UViewType &UR, const WViewType &W) { /// view checking const int m = A.extent(0); assert(m == A.extent(1) && "Eigendecomposition: A is not square"); - assert(m == er.extent(0) && - "Eigendecomposition: Length of er does not match to A's dimension"); - assert(m == ei.extent(0) && - "Eigendecomposition: Length of ei does not match to A's dimension"); - assert(m == UL.extent(0) && - "Eigendecomposition: Length of UL does not match to A's dimension"); - assert(m == UL.extent(1) && - "Eigendecomposition: Width of UL does not match to A's dimension"); - assert(m == UR.extent(0) && - "Eigendecomposition: Length of UR does not match to A's dimension"); - assert(m == UR.extent(1) && - "Eigendecomposition: Width of UR does not match to A's dimension"); + assert(m == er.extent(0) && "Eigendecomposition: Length of er does not match to A's dimension"); + assert(m == ei.extent(0) && "Eigendecomposition: Length of ei does not match to A's dimension"); + assert(m == UL.extent(0) && "Eigendecomposition: Length of UL does not match to A's dimension"); + assert(m == UL.extent(1) && "Eigendecomposition: Width of UL does not match to A's dimension"); + assert(m == UR.extent(0) && "Eigendecomposition: Length of UR does not match to A's dimension"); + assert(m == UR.extent(1) && "Eigendecomposition: Width of UR does not match to A's dimension"); // assert(W.extent(0) >= (2*m*m+5*m) && "Eigendecomposition: workspace size is // too small"); - assert(W.stride(0) == 1 && - "Eigendecomposition: Provided workspace is not contiguous"); + assert(W.stride(0) == 1 && "Eigendecomposition: Provided workspace is not contiguous"); - return m ? TeamVectorEigendecompositionInternal ::invoke( - member, A.extent(0), A.data(), A.stride(0), A.stride(1), - er.data(), er.stride(0), ei.data(), ei.stride(0), UL.data(), - UL.stride(0), UL.stride(1), UR.data(), UR.stride(0), - UR.stride(1), W.data(), W.extent(0)) + return m ? TeamVectorEigendecompositionInternal ::invoke(member, A.extent(0), A.data(), A.stride(0), A.stride(1), + er.data(), er.stride(0), ei.data(), ei.stride(0), UL.data(), + UL.stride(0), UL.stride(1), UR.data(), UR.stride(0), + UR.stride(1), W.data(), W.extent(0)) : 0; } diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp index 50324338ee..567bbd3ad5 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp @@ -40,11 +40,11 @@ namespace KokkosBatched { struct TeamVectorEigendecompositionInternal { template - KOKKOS_INLINE_FUNCTION static int device_invoke( - const MemberType &member, const int m, RealType *A, const int as0, - const int as1, RealType *er, const int ers, RealType *ei, const int eis, - RealType *UL, const int uls0, const int uls1, RealType *UR, - const int urs0, const int urs1, RealType *w, const int wlen) { + KOKKOS_INLINE_FUNCTION static int device_invoke(const MemberType &member, const int m, RealType *A, const int as0, + const int as1, RealType *er, const int ers, RealType *ei, + const int eis, RealType *UL, const int uls0, const int uls1, + RealType *UR, const int urs0, const int urs1, RealType *w, + const int wlen) { /// not yet implemented return 0; } @@ -74,13 +74,11 @@ struct TeamVectorEigendecompositionInternal { /// [out]w, [in]wlen /// Workspace template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, RealType *A, const int as0, - const int as1, RealType *er, const int ers, RealType *ei, const int eis, - RealType *UL, const int uls0, const int uls1, RealType *UR, - const int urs0, const int urs1, RealType *w, const int wlen) { - static_assert(false, - "TeamVector eigendecomposition is not implemented yet."); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, RealType *A, const int as0, + const int as1, RealType *er, const int ers, RealType *ei, const int eis, + RealType *UL, const int uls0, const int uls1, RealType *UR, const int urs0, + const int urs1, RealType *w, const int wlen) { + static_assert(false, "TeamVector eigendecomposition is not implemented yet."); /* // DO NOT USE // diff --git a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp index ae4cf10634..0ac8ed3859 100644 --- a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp @@ -61,11 +61,9 @@ struct SerialEigenvalueInternal { /// returns -1. template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ RealType *H, const int hs0, - const int hs1, + /* */ RealType *H, const int hs0, const int hs1, /* */ RealType *er, const int ers, - /* */ RealType *ei, const int eis, - const bool restart = false, + /* */ RealType *ei, const int eis, const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; @@ -94,8 +92,7 @@ struct SerialEigenvalueInternal { /// compute eigenvalues from the characteristic determinant equation bool is_complex; Kokkos::complex lambda1, lambda2; - SerialWilkinsonShiftInternal::invoke(H[0], H[hs1], H[hs0], H[hs], - &lambda1, &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(H[0], H[hs1], H[hs0], H[hs], &lambda1, &lambda2, &is_complex); er[0] = lambda1.real(); ei[0] = lambda1.imag(); er[1] = lambda2.real(); @@ -150,9 +147,8 @@ struct SerialEigenvalueInternal { bool is_complex; real_type *sub2x2 = H + (mend - 2) * hs; if (2 == mdiff) { - SerialWilkinsonShiftInternal::invoke( - sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, - &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, + &lambda2, &is_complex); sub2x2[hs0] = zero; /// eigenvalues are from wilkinson shift @@ -161,13 +157,10 @@ struct SerialEigenvalueInternal { er[(mbeg + 1) * ers] = lambda2.real(); ei[(mbeg + 1) * eis] = lambda2.imag(); } else { - SerialWilkinsonShiftInternal::invoke( - sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, - &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, + &lambda2, &is_complex); - SerialFrancisInternal::invoke(0, mdiff, mdiff, H + hs * mbeg, - hs0, hs1, lambda1, lambda2, - is_complex); + SerialFrancisInternal::invoke(0, mdiff, mdiff, H + hs * mbeg, hs0, hs1, lambda1, lambda2, is_complex); /* */ auto &val1 = *(sub2x2 + hs0); /* */ auto &val2 = *(sub2x2 - hs1); const auto abs_val1 = ats::abs(val1); @@ -217,18 +210,15 @@ struct SerialEigenvalueInternal { /// complex interface template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, - /* */ RealType *H, const int hs0, const int hs1, - /* */ Kokkos::complex *e, const int es, - const int max_iteration = 300, - const RealType user_tolerence = RealType(-1), - const bool restart = false) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, + /* */ RealType *H, const int hs0, const int hs1, + /* */ Kokkos::complex *e, const int es, + const int max_iteration = 300, const RealType user_tolerence = RealType(-1), + const bool restart = false) { RealType *er = (RealType *)e; RealType *ei = er + 1; const int two_es = 2 * es; - return invoke(m, H, hs0, hs1, er, two_es, ei, two_es, user_tolerence, - restart, max_iteration); + return invoke(m, H, hs0, hs1, er, two_es, ei, two_es, user_tolerence, restart, max_iteration); } }; diff --git a/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp b/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp index ffe911d688..42dc948014 100644 --- a/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp @@ -27,9 +27,7 @@ namespace KokkosBatched { /// ===================== struct SerialFindAmaxInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, - const ValueType *KOKKOS_RESTRICT A, - const int as0, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, /**/ IntType *KOKKOS_RESTRICT idx) { ValueType max_val(A[0]); IntType val_loc(0); @@ -50,14 +48,11 @@ struct SerialFindAmaxInternal { /// ======================== struct TeamVectorFindAmaxInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, - const ValueType *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ValueType *KOKKOS_RESTRICT A, const int as0, /**/ IntType *KOKKOS_RESTRICT idx) { if (m > 0) { - using reducer_value_type = - typename Kokkos::MaxLoc::value_type; + using reducer_value_type = typename Kokkos::MaxLoc::value_type; reducer_value_type value{}; Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( diff --git a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp index 21587f4481..e303cafd1f 100644 --- a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp @@ -32,12 +32,11 @@ namespace KokkosBatched { /// struct SerialFrancisInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int mbeg, const int mend, const int morg, - /* */ ValueType *HH, const int hs0, const int hs1, - const Kokkos::complex lambda1, - const Kokkos::complex lambda2, const bool is_complex, - /* */ Kokkos::pair *GG, const bool request_schur) { + KOKKOS_INLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, + const Kokkos::complex lambda1, + const Kokkos::complex lambda2, const bool is_complex, + /* */ Kokkos::pair *GG, const bool request_schur) { typedef ValueType value_type; const int hs = hs0 + hs1; @@ -73,25 +72,21 @@ struct SerialFrancisInternal { // this needs m>=3 // v = M e_1 = (H*H - 2 Re(lambda) H + |lambda|^2 I)e_1 value_type s, t; - const value_type h00 = H[0 * hs0 + 0 * hs1], h01 = H[0 * hs0 + 1 * hs1], - h10 = H[1 * hs0 + 0 * hs1], h11 = H[1 * hs0 + 1 * hs1], + const value_type h00 = H[0 * hs0 + 0 * hs1], h01 = H[0 * hs0 + 1 * hs1], h10 = H[1 * hs0 + 0 * hs1], + h11 = H[1 * hs0 + 1 * hs1], /* */ h21 = H[2 * hs0 + 1 * hs1]; if (is_complex) { s = 2 * lambda1.real(); t = lambda1.real() * lambda1.real() + lambda1.imag() * lambda1.imag(); } else { - const value_type val = H[(m - 1) * hs]; - const auto dist_lambda1 = - Kokkos::ArithTraits::abs(lambda1.real() - val); - const auto dist_lambda2 = - Kokkos::ArithTraits::abs(lambda2.real() - val); - const value_type lambda = - dist_lambda1 < dist_lambda2 ? lambda1.real() : lambda2.real(); - s = 2 * lambda; - t = lambda * lambda; + const value_type val = H[(m - 1) * hs]; + const auto dist_lambda1 = Kokkos::ArithTraits::abs(lambda1.real() - val); + const auto dist_lambda2 = Kokkos::ArithTraits::abs(lambda2.real() - val); + const value_type lambda = dist_lambda1 < dist_lambda2 ? lambda1.real() : lambda2.real(); + s = 2 * lambda; + t = lambda * lambda; } - v[0] = - h00 * h00 + h01 * h10 /* H^2 e_1 */ - s * h00 /* 2 Re(lambda) */ + t; + v[0] = h00 * h00 + h01 * h10 /* H^2 e_1 */ - s * h00 /* 2 Re(lambda) */ + t; v[1] = h10 * h00 + h11 * h10 /* */ - s * h10; v[2] = h21 * h10; } @@ -112,9 +107,8 @@ struct SerialFrancisInternal { const int mm = m < 4 ? m : 4, nn = m; value_type *Hs = H - mbeg_mult_hs0; - SerialApplyLeftRightGivensInternal ::invoke( - G[0], G[1], mm + mbeg, nn + mrst, H, H + hs0, H + 2 * hs0, Hs, - Hs + hs1, Hs + 2 * hs1, hs0, hs1); + SerialApplyLeftRightGivensInternal ::invoke(G[0], G[1], mm + mbeg, nn + mrst, H, H + hs0, H + 2 * hs0, Hs, + Hs + hs1, Hs + 2 * hs1, hs0, hs1); } /// 1. chase the bulge @@ -155,9 +149,8 @@ struct SerialFrancisInternal { value_type *a2 = a1 + hs1; value_type *a3 = a2 + hs1; - SerialApplyLeftRightGivensInternal ::invoke(G[0], G[1], mm + mbeg, - nn + mrst, a1t, a2t, a3t, a1, - a2, a3, hs0, hs1); + SerialApplyLeftRightGivensInternal ::invoke(G[0], G[1], mm + mbeg, nn + mrst, a1t, a2t, a3t, a1, a2, a3, hs0, + hs1); /// ----------------------------------------------------- H_part2x2.mergeToATL(H_part3x3); } @@ -181,8 +174,7 @@ struct SerialFrancisInternal { value_type *a2t = a1t + hs0; value_type *a1 = H_part3x3.A01 - mbeg_mult_hs0; value_type *a2 = a1 + hs1; - SerialApplyLeftRightGivensInternal ::invoke(G[0], mm + mbeg, nn + mrst, - a1t, a2t, a1, a2, hs0, hs1); + SerialApplyLeftRightGivensInternal ::invoke(G[0], mm + mbeg, nn + mrst, a1t, a2t, a1, a2, hs0, hs1); /// ----------------------------------------------------- H_part2x2.mergeToATL(H_part3x3); @@ -192,11 +184,10 @@ struct SerialFrancisInternal { } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const int mbeg, const int mend, const int morg, - /* */ ValueType *HH, const int hs0, const int hs1, - const Kokkos::complex lambda1, - const Kokkos::complex lambda2, const bool is_complex) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, + const Kokkos::complex lambda1, + const Kokkos::complex lambda2, const bool is_complex) { return invoke(mbeg, mend, morg, HH, hs0, hs1, lambda1, lambda2, is_complex, (Kokkos::pair *)NULL, false); } diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp index 6b3cec25da..82d6b1641b 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp @@ -36,44 +36,31 @@ namespace KokkosBatched { /// NT/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = C.extent(1), k = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -83,80 +70,56 @@ SerialGemm -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } /// /// T/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = C.extent(1), k = A.extent(0); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_NOTRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -166,77 +129,56 @@ SerialGemm -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } /// /// NT/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = C.extent(1), k = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -246,74 +188,56 @@ SerialGemm -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } /// /// T/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { typedef typename CViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = C.extent(0), n = C.extent(1), k = A.extent(0); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1 && C.stride_0() == 1) { - mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_1(), - (const double *)B.data(), B.stride_1(), beta, - (double *)C.data(), C.stride_1(), format, + mkl_dgemm_compact(MKL_COL_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_1(), + (const double *)B.data(), B.stride_1(), beta, (double *)C.data(), C.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1 && C.stride_1() == 1) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, - (const double *)A.data(), A.stride_0(), - (const double *)B.data(), B.stride_0(), beta, - (double *)C.data(), C.stride_0(), format, + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_TRANS, MKL_TRANS, m, n, k, alpha, (const double *)A.data(), A.stride_0(), + (const double *)B.data(), B.stride_0(), beta, (double *)C.data(), C.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -323,33 +247,25 @@ SerialGemm::invoke( #endif template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemm::invoke( - const ScalarType alpha, const AViewType &A, const BViewType &B, - const ScalarType beta, const CViewType &C) { +template +KOKKOS_INLINE_FUNCTION int SerialGemm::invoke( + const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return SerialGemmInternal::invoke( - C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), - C.stride_0(), C.stride_1()); + return SerialGemmInternal::invoke(C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp index 43197f1da3..eaa5b67ffa 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp @@ -34,21 +34,18 @@ namespace KokkosBatched { template struct SerialGemmInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const int k, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, + const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); }; template <> template KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( - const int m, const int n, const int k, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - const ScalarType beta, + const int m, const int n, const int k, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) @@ -65,8 +62,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( ValueType *KOKKOS_RESTRICT pC = C; for (int p = 0; p < k; ++p) { - const ValueType *KOKKOS_RESTRICT pA = A + p * as1, - *KOKKOS_RESTRICT pB = B + p * bs0; + const ValueType *KOKKOS_RESTRICT pA = A + p * as1, *KOKKOS_RESTRICT pB = B + p * bs0; for (int i = 0; i < m; ++i) { const ValueType tA(alpha * pA[i * as0]); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -82,10 +78,8 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( - const int m, const int n, const int k, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, - const ScalarType beta, + const int m, const int n, const int k, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) @@ -105,17 +99,14 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( const ValueType alpha_value(alpha); InnerGemmFixC inner(as0, as1, bs0, bs1, cs0, cs1); - auto gemm = [&](const int ib, const int jb, const int pb, - const ValueType *KOKKOS_RESTRICT AA, + auto gemm = [&](const int ib, const int jb, const int pb, const ValueType *KOKKOS_RESTRICT AA, const ValueType *KOKKOS_RESTRICT BB, /**/ ValueType *KOKKOS_RESTRICT CC) { const int mb = mbAlgo, nb = nbAlgo; for (int i = 0; i < ib; i += mb) for (int j = 0; j < jb; j += nb) - inner.serial_invoke(alpha_value, AA + i * as0, BB + j * bs1, - (i + mb) > ib ? (ib - i) : mb, - (j + nb) > jb ? (jb - j) : nb, pb, - CC + i * cs0 + j * cs1); + inner.serial_invoke(alpha_value, AA + i * as0, BB + j * bs1, (i + mb) > ib ? (ib - i) : mb, + (j + nb) > jb ? (jb - j) : nb, pb, CC + i * cs0 + j * cs1); }; const bool is_small = true; //(m*n*k <= 64*64*64); diff --git a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp index aedfb9f662..64e65d62d8 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp @@ -40,19 +40,15 @@ namespace KokkosBatched { /// template -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_0(), B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -61,19 +57,15 @@ struct TeamVectorGemm -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_0(), B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -82,19 +74,15 @@ struct TeamVectorGemm -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_1(), B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -103,19 +91,15 @@ struct TeamVectorGemm -struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamVectorGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) return TeamVectorGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_1(), B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index 7e40ec4415..8ad7d570df 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -31,21 +31,18 @@ namespace KokkosBatched { template struct TeamVectorGemmInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, - const int bs1, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const int bs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, +KOKKOS_INLINE_FUNCTION int TeamVectorGemmInternal::invoke( + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B @@ -54,11 +51,9 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, - cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, - cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -67,15 +62,13 @@ TeamVectorGemmInternal::invoke( Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { const ValueType *KOKKOS_RESTRICT pA = A + i * as0; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { - const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; - - ValueType c = ValueType(0); - for (int p = 0; p < k; ++p) - c += pA[p * as1] * pB[p * bs0]; - C[i * cs0 + j * cs1] += alpha * c; - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; + + ValueType c = ValueType(0); + for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0]; + C[i * cs0 + j * cs1] += alpha * c; + }); }); } return 0; @@ -83,11 +76,9 @@ TeamVectorGemmInternal::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, +KOKKOS_INLINE_FUNCTION int TeamVectorGemmInternal::invoke( + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B @@ -96,11 +87,9 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, - cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, - cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -109,16 +98,13 @@ TeamVectorGemmInternal::invoke( Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { const ValueType *KOKKOS_RESTRICT pA = A + i * as0; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), [&](const int &j) { - const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; - - ValueType c = ValueType(0); - for (int p = 0; p < k; ++p) - c += Kokkos::ArithTraits::conj(pA[p * as1]) * - pB[p * bs0]; - C[i * cs0 + j * cs1] += alpha * c; - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { + const ValueType *KOKKOS_RESTRICT pB = B + j * bs1; + + ValueType c = ValueType(0); + for (int p = 0; p < k; ++p) c += Kokkos::ArithTraits::conj(pA[p * as1]) * pB[p * bs0]; + C[i * cs0 + j * cs1] += alpha * c; + }); }); } return 0; diff --git a/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp index 647ffbdb26..0a9fb87b9e 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp @@ -40,36 +40,28 @@ namespace KokkosBatched { /// template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -78,36 +70,28 @@ struct TeamGemm -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), + B.stride_1(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -116,36 +100,28 @@ struct TeamGemm -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; @@ -154,36 +130,28 @@ struct TeamGemm -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; template -struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { +struct TeamGemm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // C = beta C + alpha A B // C (m x n), A(m x k), B(k x n) - return TeamGemmInternal::invoke( - member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, - C.data(), C.stride_0(), C.stride_1()); + return TeamGemmInternal::invoke(member, C.extent(0), C.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index 988a4e5da2..1b77a25991 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -34,20 +34,18 @@ namespace KokkosBatched { template struct TeamGemmInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, - const int bs1, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const int bs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1); }; template <> template KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B @@ -58,25 +56,22 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( if (beta == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, - cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; if (beta != one) member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { - // assume layout right for batched computation - const int i = ij / n, j = ij % n; - const ValueType *KOKKOS_RESTRICT pA = A + i * as0, - *KOKKOS_RESTRICT pB = B + j * bs1; - - ValueType c = ValueType(0); - for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0]; - C[i * cs0 + j * cs1] += alpha * c; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + // assume layout right for batched computation + const int i = ij / n, j = ij % n; + const ValueType *KOKKOS_RESTRICT pA = A + i * as0, *KOKKOS_RESTRICT pB = B + j * bs1; + + ValueType c = ValueType(0); + for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0]; + C[i * cs0 + j * cs1] += alpha * c; + }); } return 0; } @@ -84,9 +79,8 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( - const MemberType &member, const int m, const int n, const int k, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, + const MemberType &member, const int m, const int n, const int k, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT C, const int cs0, const int cs1) { // C = beta C + alpha A B @@ -100,8 +94,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( if (beta == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, - cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -111,31 +104,27 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( /// /// GPU case: team size is large and blocksize (mb,nb) is small InnerGemmFixC inner(as0, as1, bs0, bs1, cs0, cs1); - auto gemm = [&](const int ib, const int jb, const int pb, - const ValueType *KOKKOS_RESTRICT AA, + auto gemm = [&](const int ib, const int jb, const int pb, const ValueType *KOKKOS_RESTRICT AA, const ValueType *KOKKOS_RESTRICT BB, /**/ ValueType *KOKKOS_RESTRICT CC) { // Made this non-const in order to WORKAROUND issue #349 - int mb = mbAlgo, mp = (ib % mb), mq = (ib / mb) + (mp > 0), nb = nbAlgo, - np = (jb % nb), nq = (jb / nb) + (np > 0); + int mb = mbAlgo, mp = (ib % mb), mq = (ib / mb) + (mp > 0), nb = nbAlgo, np = (jb % nb), + nq = (jb / nb) + (np > 0); // square tiling - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, mq * nq), [&](const int &ij) { - int i, j; - // note: the condition is constexpr - if (KokkosKernels::Impl::kk_is_gpu_exec_space< - typename MemberType::execution_space>()) { - i = ij % mq * mb; - j = ij / mq * nb; - } else { - i = ij / nq * mb; - j = ij % nq * nb; - } - inner.serial_invoke( - alpha, AA + i * as0, BB + j * bs1, (i + mb) > ib ? mp : mb, - (j + nb) > jb ? np : nb, pb, CC + i * cs0 + j * cs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, mq * nq), [&](const int &ij) { + int i, j; + // note: the condition is constexpr + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij % mq * mb; + j = ij / mq * nb; + } else { + i = ij / nq * mb; + j = ij % nq * nb; + } + inner.serial_invoke(alpha, AA + i * as0, BB + j * bs1, (i + mb) > ib ? mp : mb, (j + nb) > jb ? np : nb, pb, + CC + i * cs0 + j * cs1); + }); }; const bool is_small = true; //(m*n*k <= 64*64*64); diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index a0b948bb13..4f54bf7f31 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -41,43 +41,30 @@ namespace KokkosBatched { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); if (A.extent(0) == 1) { - KokkosBlas::TeamVectorGemv< - MemberType, Trans::NoTranspose, - Algo::Gemv::Unblocked>::invoke(member, alpha, - Kokkos::subview(A, 0, Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview(x, 0, Kokkos::ALL), - beta, - Kokkos::subview(y, 0, Kokkos::ALL)); + KokkosBlas::TeamVectorGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); return 0; } return TeamVectorGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), - A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), A.stride_0(), A.stride_1(), A.stride_2(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " @@ -94,32 +81,24 @@ struct TeamVectorGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); return TeamVectorGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_2(), A.stride_1(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index 0ffc60ec90..8d9676b223 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -30,30 +30,24 @@ namespace KokkosBatched { /// ==================== template struct TeamVectorGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const int /*N*/, const int /*m*/, - const int /*n*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, const int /*as2*/, - const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/, - const int /*xs1*/, const ScalarType /*beta*/, - /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/, - const int /*ys1*/) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const int /*N*/, const int /*m*/, + const int /*n*/, const ScalarType /*alpha*/, + const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, const int /*as1*/, + const int /*as2*/, const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/, + const int /*xs1*/, const ScalarType /*beta*/, + /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/, + const int /*ys1*/) { assert(false && "Error: encounter dummy impl"); return 0; } }; template <> -template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemvInternal::invoke( - const MemberType &member, const int N, const int m, const int n, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, +template +KOKKOS_INLINE_FUNCTION int TeamVectorGemvInternal::invoke( + const MemberType &member, const int N, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, const int xs0, const int xs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { const ScalarType one(1.0), zero(0.0); @@ -64,37 +58,32 @@ TeamVectorGemvInternal::invoke( if (beta == zero) // TODO: KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, // ys0); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] = zero; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + }); else if (beta != one) // TODO: KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, // y, ys0); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] *= beta; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; if (beta != one) member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - ValueType t(0); - getIndices(iTemp, m, N, iRow, iMatrix); - for (int i = 0; i < n; ++i) - t += A[as0 * iMatrix + as1 * iRow + as2 * i] * - X[xs0 * iMatrix + xs1 * i]; - Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) t += A[as0 * iMatrix + as1 * iRow + as2 * i] * X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); } return 0; } diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 48627aaf30..16f12529d4 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -42,11 +42,9 @@ namespace KokkosBatched { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && "Batched TeamGemv requires rank-3 A matrix (use " @@ -58,34 +56,23 @@ struct TeamGemv { } if (A.extent(0) == 1) { - KokkosBlas::TeamGemv< - MemberType, Trans::NoTranspose, - Algo::Gemv::Unblocked>::invoke(member, alpha, - Kokkos::subview(A, 0, Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview(x, 0, Kokkos::ALL), - beta, - Kokkos::subview(y, 0, Kokkos::ALL)); + KokkosBlas::TeamGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); return 0; } return TeamGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), - A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), A.stride_0(), A.stride_1(), A.stride_2(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { /* if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && @@ -108,11 +95,9 @@ struct TeamGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && "Batched TeamGemv requires rank-3 A matrix (use " @@ -123,31 +108,23 @@ struct TeamGemv { "KokkosBlas::TeamGemv for regular rank-2 matrix)"); } if (A.extent(0) == 1) { - KokkosBlas:: - TeamGemv::invoke( - member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(x, 0, Kokkos::ALL), beta, - Kokkos::subview(y, 0, Kokkos::ALL)); + KokkosBlas::TeamGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); return 0; } return TeamGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_2(), A.stride_1(), + x.data(), x.stride_0(), x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { /* if constexpr (Kokkos::is_dyn_rank_view::value) { assert(A.rank_dynamic() == 3 && diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 77629c678f..8f63e24b27 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -20,9 +20,9 @@ #include "KokkosBatched_Util.hpp" -//#include "KokkosBlas1_set_impl.hpp" -//#include "KokkosBlas1_team_scal_impl.hpp" -//#include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp" +// #include "KokkosBlas1_set_impl.hpp" +// #include "KokkosBlas1_team_scal_impl.hpp" +// #include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp" namespace KokkosBatched { @@ -31,23 +31,19 @@ namespace KokkosBatched { /// ==================== template struct TeamGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int N, const int m, const int n, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const int as2, const ValueType *KOKKOS_RESTRICT x, - const int xs0, const int xs1, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT x, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1); }; template <> -template +template KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, const int N, const int m, const int n, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, + const MemberType &member, const int N, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, const int xs0, const int xs1, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { const ScalarType one(1.0), zero(0.0); @@ -56,35 +52,30 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( // y_l (m), A_l(m x n), B_l(n) if (beta == zero) - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] = zero; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + }); else if (beta != one) - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - getIndices(iTemp, m, N, iRow, iMatrix); - Y[ys0 * iMatrix + ys1 * iRow] *= beta; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; if (beta != one) member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), - [&](const int &iTemp) { - int iRow, iMatrix; - ValueType t(0); - getIndices(iTemp, m, N, iRow, iMatrix); - for (int i = 0; i < n; ++i) - t += A[as0 * iMatrix + as1 * iRow + as2 * i] * - X[xs0 * iMatrix + xs1 * i]; - Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) t += A[as0 * iMatrix + as1 * iRow + as2 * i] * X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); } return 0; } diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 4c9f54d037..ba18cbafd7 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -26,40 +26,33 @@ namespace KokkosBatched { struct SerialStaticPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, - const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, - const VectorType2 tmp_v_2); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2); }; template struct TeamStaticPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); }; template struct TeamVectorStaticPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); }; -template -KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke( - const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, - const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, - const VectorType2 tmp_v_2) { +template +KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke(const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, + const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { using value_type = typename MatrixType1::non_const_value_type; const size_t n = A.extent(0); @@ -139,15 +132,14 @@ KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { - using value_type = typename MatrixType1::non_const_value_type; - using reducer_value_type = - typename Kokkos::MaxLoc::value_type; +template +KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke(const MemberType &member, const MatrixType1 A, + const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = typename Kokkos::MaxLoc::value_type; // This implementation follows the strategy of SerialStaticPivoting but uses // an extra level of parallelism. @@ -222,15 +214,14 @@ KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( - const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, - const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, - const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { - using value_type = typename MatrixType1::non_const_value_type; - using reducer_value_type = - typename Kokkos::MaxLoc::value_type; +template +KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke(const MemberType &member, const MatrixType1 A, + const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = typename Kokkos::MaxLoc::value_type; // This implementation follows the strategy of SerialStaticPivoting but uses // two extra levels of parallelism. @@ -265,8 +256,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( }); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A(i, j) *= D2(j); }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A(i, j) *= D2(j); }); }); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { @@ -283,8 +273,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( }, reducer_value); D1_i = 1. / value.val; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A(i, j) *= D1_i; }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A(i, j) *= D1_i; }); Y(i) *= D1_i; }); @@ -318,18 +307,15 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { - PDAD(col_index, j) = A(row_index, j); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), + [&](const int &j) { PDAD(col_index, j) = A(row_index, j); }); PDY(col_index) = Y(row_index); } return 0; } template -KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, - const VectorType2 D, - const VectorType3 DX) { +KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, const VectorType2 D, const VectorType3 DX) { const size_t n = X.extent(0); for (size_t i = 0; i < n; ++i) { @@ -337,28 +323,20 @@ KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, } } -template -KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member, - const VectorType1 X, - const VectorType2 D, +template +KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member, const VectorType1 X, const VectorType2 D, const VectorType3 DX) { const size_t n = X.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), - [&](const size_t &i) { DX(i) = D(i) * X(i); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const size_t &i) { DX(i) = D(i) * X(i); }); } -template -KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, - const VectorType1 X, - const VectorType2 D, +template +KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, const VectorType1 X, const VectorType2 D, const VectorType3 DX) { const size_t n = X.extent(0); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), - [&](const size_t &i) { DX(i) = D(i) * X(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const size_t &i) { DX(i) = D(i) * X(i); }); } /// @@ -367,23 +345,15 @@ KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, template <> struct SerialGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const XVectorType X, - const YVectorType Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const XVectorType X, const YVectorType Y, const MatrixType tmp) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(XVectorType::rank == 1, - "KokkosBatched::gesv: XVectorType must have rank 1."); - static_assert(YVectorType::rank == 1, - "KokkosBatched::gesv: YVectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(XVectorType::rank == 1, "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -392,18 +362,15 @@ struct SerialGesv { "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " "%d x %d, tmp (note: its second dimension should be the second " "dimension of A + 4): %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), - (int)tmp.extent(1)); + (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), (int)tmp.extent(1)); return 1; } - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -416,8 +383,7 @@ struct SerialGesv { auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == - 1) { + if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); @@ -427,14 +393,12 @@ struct SerialGesv { int r_val = SerialLU::invoke(PDAD); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, PDAD, PDY); + r_val = SerialTrsm::invoke( + 1.0, PDAD, PDY); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, PDAD, PDY); + r_val = SerialTrsm::invoke( + 1.0, PDAD, PDY); if (r_val == 0) SerialHadamard1D(PDY, D2, X); return r_val; @@ -444,33 +408,23 @@ struct SerialGesv { template <> struct SerialGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const XVectorType X, - const YVectorType Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const XVectorType X, const YVectorType Y, const MatrixType /*tmp*/) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(XVectorType::rank == 1, - "KokkosBatched::gesv: XVectorType must have rank 1."); - static_assert(YVectorType::rank == 1, - "KokkosBatched::gesv: YVectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(XVectorType::rank == 1, "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -480,14 +434,12 @@ struct SerialGesv { if (r_val == 0) r_val = SerialCopy::invoke(Y, X); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, A, X); + r_val = SerialTrsm::invoke( + 1.0, A, X); if (r_val == 0) - r_val = - SerialTrsm::invoke(1.0, A, X); + r_val = SerialTrsm::invoke( + 1.0, A, X); return r_val; } @@ -500,34 +452,25 @@ struct SerialGesv { template struct TeamGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif - using ScratchPadMatrixViewType = Kokkos::View< - typename MatrixType::non_const_value_type **, - typename MatrixType::execution_space::scratch_memory_space>; + using ScratchPadMatrixViewType = Kokkos::View; const int n = A.extent(0); @@ -538,8 +481,7 @@ struct TeamGesv { auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, - tmp_v_1, tmp_v_2) == 1) { + if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); @@ -547,22 +489,18 @@ struct TeamGesv { } member.team_barrier(); - int r_val = - TeamLU::invoke(member, PDAD); + int r_val = TeamLU::invoke(member, PDAD); member.team_barrier(); if (r_val == 0) { - r_val = TeamTrsm::invoke(member, 1.0, - PDAD, PDY); + r_val = TeamTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } if (r_val == 0) { - r_val = - TeamTrsm::invoke(member, 1.0, - PDAD, PDY); + r_val = TeamTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } @@ -578,28 +516,20 @@ struct TeamGesv { template struct TeamGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -613,15 +543,14 @@ struct TeamGesv { } if (r_val == 0) { - TeamTrsm::invoke(member, 1.0, A, X); + TeamTrsm::invoke( + member, 1.0, A, X); member.team_barrier(); } if (r_val == 0) { - TeamTrsm::invoke(member, 1.0, A, - X); + TeamTrsm::invoke( + member, 1.0, A, X); member.team_barrier(); } @@ -636,34 +565,25 @@ struct TeamGesv { template struct TeamVectorGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif - using ScratchPadMatrixViewType = Kokkos::View< - typename MatrixType::non_const_value_type **, - typename MatrixType::execution_space::scratch_memory_space>; + using ScratchPadMatrixViewType = Kokkos::View; const int n = A.extent(0); @@ -674,8 +594,7 @@ struct TeamVectorGesv { auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - if (TeamVectorStaticPivoting::invoke( - member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { + if (TeamVectorStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { Kokkos::printf( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); @@ -684,22 +603,18 @@ struct TeamVectorGesv { member.team_barrier(); - int r_val = - TeamLU::invoke(member, PDAD); + int r_val = TeamLU::invoke(member, PDAD); member.team_barrier(); if (r_val == 0) { - TeamVectorTrsm::invoke(member, 1.0, - PDAD, PDY); + TeamVectorTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } if (r_val == 0) { - TeamVectorTrsm::invoke(member, - 1.0, PDAD, - PDY); + TeamVectorTrsm::invoke(member, 1.0, PDAD, PDY); member.team_barrier(); } @@ -715,28 +630,20 @@ struct TeamVectorGesv { template struct TeamVectorGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { Kokkos::printf( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), - (int)Y.extent(0)); + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); return 1; } #endif @@ -750,16 +657,14 @@ struct TeamVectorGesv { } if (r_val == 0) { - TeamVectorTrsm::invoke(member, 1.0, - A, X); + TeamVectorTrsm::invoke(member, 1.0, A, X); member.team_barrier(); } if (r_val == 0) { - TeamVectorTrsm::invoke(member, - 1.0, A, X); + TeamVectorTrsm::invoke(member, 1.0, A, X); member.team_barrier(); } diff --git a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp index 4d80c6a250..963862661b 100644 --- a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp @@ -30,10 +30,9 @@ namespace KokkosBatched { /// struct SerialGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const ValueType chi1, const ValueType chi2, - /* */ Kokkos::pair* G, - /* */ ValueType* chi1_new) { + KOKKOS_INLINE_FUNCTION static int invoke(const ValueType chi1, const ValueType chi2, + /* */ Kokkos::pair* G, + /* */ ValueType* chi1_new) { typedef ValueType value_type; const value_type zero(0), one(1); /// compute G = [ gamma -sigma; @@ -58,9 +57,7 @@ struct SerialGivensInternal { cs = chi1 / r; sn = chi2 / r; - if (Kokkos::ArithTraits::abs(chi1) > - Kokkos::ArithTraits::abs(chi2) && - cs < zero) { + if (Kokkos::ArithTraits::abs(chi1) > Kokkos::ArithTraits::abs(chi2) && cs < zero) { cs = -cs; sn = -sn; r = -r; diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index 90b89e4ad1..658acd6b60 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -27,16 +27,12 @@ namespace KokkosBatched { /// ==================== struct SerialHadamardProductInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, const int xs1, - const ValueType* KOKKOS_RESTRICT Y, - const int ys0, const int ys1, - /* */ ValueType* KOKKOS_RESTRICT V, - const int vs0, const int vs1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ValueType* KOKKOS_RESTRICT X, const int xs0, + const int xs1, const ValueType* KOKKOS_RESTRICT Y, const int ys0, + const int ys1, + /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) - V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; + for (int j = 0; j < n; ++j) V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; return 0; } @@ -47,17 +43,15 @@ struct SerialHadamardProductInternal { /// ==================== struct TeamHadamardProductInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, - /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, + /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; + }); // member.team_barrier(); return 0; } @@ -68,17 +62,15 @@ struct TeamHadamardProductInternal { /// ======================== struct TeamVectorHadamardProductInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, - /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; - }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + const ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1, + /* */ ValueType* KOKKOS_RESTRICT V, const int vs0, const int vs1) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + V[i * vs0 + j * vs1] = X[i * xs0 + j * xs1] * Y[i * ys0 + j * ys1]; + }); // member.team_barrier(); return 0; } @@ -88,25 +80,14 @@ struct TeamVectorHadamardProductInternal { /// Serial Impl /// =========== template -KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, - const YViewType& Y, - const VViewType& V) { +KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, const YViewType& Y, const VViewType& V) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::rank == 2, - "KokkosBatched::HadamardProduct: VViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -127,10 +108,9 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, } #endif - return SerialHadamardProductInternal::template invoke< - typename XViewType::non_const_value_type>( - X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), - Y.stride_0(), Y.stride_1(), V.data(), V.stride_0(), V.stride_1()); + return SerialHadamardProductInternal::template invoke( + X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), V.data(), + V.stride_0(), V.stride_1()); } /// @@ -139,25 +119,15 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, template template -KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( - const MemberType& member, const XViewType& X, const YViewType& Y, - const VViewType& V) { +KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke(const MemberType& member, const XViewType& X, + const YViewType& Y, const VViewType& V) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::rank == 2, - "KokkosBatched::HadamardProduct: VViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -178,12 +148,10 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( } #endif - return TeamHadamardProductInternal::template invoke< - MemberType, typename XViewType::non_const_value_type, - typename XViewType::array_layout>(member, X.extent(0), X.extent(1), - X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), - V.data(), V.stride_0(), V.stride_1()); + return TeamHadamardProductInternal::template invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + V.data(), V.stride_0(), V.stride_1()); } /// @@ -192,25 +160,15 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( template template -KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( - const MemberType& member, const XViewType& X, const YViewType& Y, - const VViewType& V) { +KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke(const MemberType& member, const XViewType& X, + const YViewType& Y, const VViewType& V) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); - static_assert( - Kokkos::is_view::value, - "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::rank == 2, - "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::rank == 2, - "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::rank == 2, - "KokkosBatched::HadamardProduct: VViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: XViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: YViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -231,12 +189,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( } #endif - return TeamVectorHadamardProductInternal::invoke< - MemberType, typename XViewType::non_const_value_type, - typename XViewType::array_layout>(member, X.extent(0), X.extent(1), - X.data(), X.stride_0(), X.stride_1(), - Y.data(), Y.stride_0(), Y.stride_1(), - V.data(), V.stride_0(), V.stride_1()); + return TeamVectorHadamardProductInternal::invoke( + member, X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1(), + V.data(), V.stride_0(), V.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp index 023257c8ed..8db5d40a98 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp @@ -34,13 +34,10 @@ namespace KokkosBatched { struct SerialHessenbergFormQInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int k, - /* */ ValueType* A, const int as0, - const int as1, + /* */ ValueType* A, const int as0, const int as1, /* */ ValueType* t, const int ts, - /* */ ValueType* Q, const int qs0, - const int qs1, - /* */ ValueType* w, - const bool is_Q_zero = false) { + /* */ ValueType* Q, const int qs0, const int qs1, + /* */ ValueType* w, const bool is_Q_zero = false) { typedef ValueType value_type; /// Given a matrix A that includes Hessenberg factorization @@ -52,14 +49,12 @@ struct SerialHessenbergFormQInternal { /// B is m x m // set identity if (is_Q_zero) - KokkosBlas::Impl::SerialSetInternal::invoke(m, value_type(1), Q, - qs0 + qs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); else SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); - return SerialApplyQ_LeftNoTransForwardInternal ::invoke( - m - 1, m - 1, k - 1, A + as0, as0, as1, t, ts, Q + qs0 + qs1, qs1, qs0, - w); + return SerialApplyQ_LeftNoTransForwardInternal ::invoke(m - 1, m - 1, k - 1, A + as0, as0, as1, t, ts, + Q + qs0 + qs1, qs1, qs0, w); } }; diff --git a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp index 3d2b75e64d..3815a9e18e 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp @@ -32,10 +32,9 @@ namespace KokkosBatched { /// struct SerialHessenbergQR_WithShiftInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int mbeg, const int mend, const int morg, - /* */ ValueType *HH, const int hs0, const int hs1, const ValueType shift, - /* */ Kokkos::pair *GG, const bool request_schur) { + KOKKOS_INLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, const ValueType shift, + /* */ Kokkos::pair *GG, const bool request_schur) { typedef ValueType value_type; // typedef Kokkos::ArithTraits ats; @@ -79,13 +78,11 @@ struct SerialHessenbergQR_WithShiftInternal { // apply G' from left G.second = -G.second; // transpose G const int nn = m; - SerialApplyLeftGivensInternal::invoke(G, nn + (morg - mend), h11, hs1, - h21, hs1); + SerialApplyLeftGivensInternal::invoke(G, nn + (morg - mend), h11, hs1, h21, hs1); // apply (G')' from right const int mm = m < 3 ? m : 3; - SerialApplyRightGivensInternal::invoke(G, mm + mbeg, h11 - mbeg_mult_hs0, - hs0, h12 - mbeg_mult_hs0, hs0); + SerialApplyRightGivensInternal::invoke(G, mm + mbeg, h11 - mbeg_mult_hs0, hs0, h12 - mbeg_mult_hs0, hs0); } /// 1. chase the bulge @@ -112,13 +109,11 @@ struct SerialHessenbergQR_WithShiftInternal { G.second = -G.second; // transpose G const int nn = m - m_htl; - SerialApplyLeftGivensInternal::invoke( - G, nn + (morg - mend), H_part3x3.A11, hs1, H_part3x3.A21, hs1); + SerialApplyLeftGivensInternal::invoke(G, nn + (morg - mend), H_part3x3.A11, hs1, H_part3x3.A21, hs1); const int mtmp = m_htl + 3, mm = mtmp < m ? mtmp : m; - SerialApplyRightGivensInternal::invoke( - G, mm + mbeg, H_part3x3.A01 - mbeg_mult_hs0, hs0, - H_part3x3.A02 - mbeg_mult_hs0, hs0); + SerialApplyRightGivensInternal::invoke(G, mm + mbeg, H_part3x3.A01 - mbeg_mult_hs0, hs0, + H_part3x3.A02 - mbeg_mult_hs0, hs0); /// ----------------------------------------------------- H_part2x2.mergeToATL(H_part3x3); } @@ -126,13 +121,10 @@ struct SerialHessenbergQR_WithShiftInternal { } template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int mbeg, const int mend, - const int morg, - /* */ ValueType *HH, - const int hs0, const int hs1, + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType *HH, const int hs0, const int hs1, const ValueType shift) { - return invoke(mbeg, mend, morg, HH, hs0, hs1, shift, - (Kokkos::pair *)NULL, false); + return invoke(mbeg, mend, morg, HH, hs0, hs1, shift, (Kokkos::pair *)NULL, false); } }; diff --git a/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp index f12115e4de..44c5b44373 100644 --- a/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp @@ -34,8 +34,7 @@ struct SerialHessenbergInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, // m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, /* */ ValueType *w) { typedef ValueType value_type; @@ -76,25 +75,22 @@ struct SerialHessenbergInternal { // perform householder transformation const int m_A22_b = m_A22 - 1; - SerialLeftHouseholderInternal::invoke(m_A22_b, A21_part2x1.AT, - A21_part2x1.AB, as0, tau); + SerialLeftHouseholderInternal::invoke(m_A22_b, A21_part2x1.AT, A21_part2x1.AB, as0, tau); // partition A22 into 2x1 A22_part2x1.partWithAT(A_part3x3.A22, m_A22, 1); // left apply householder to partitioned A22 - SerialApplyLeftHouseholderInternal::invoke( - m_A22_b, n_A22, tau, A21_part2x1.AB, as0, A22_part2x1.AT, as1, - A22_part2x1.AB, as0, as1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A22_b, n_A22, tau, A21_part2x1.AB, as0, A22_part2x1.AT, as1, + A22_part2x1.AB, as0, as1, w); // partition A*2 column into 1x2 A2_part1x2.partWithAL(A_part3x3.A02, n_A22, 1); // right apply householder to A*2 colums const int n_A22_r = n_A22 - 1; - SerialApplyRightHouseholderInternal::invoke( - m, n_A22_r, tau, A21_part2x1.AB, as0, A2_part1x2.AL, as0, - A2_part1x2.AR, as0, as1, w); + SerialApplyRightHouseholderInternal::invoke(m, n_A22_r, tau, A21_part2x1.AB, as0, A2_part1x2.AL, as0, + A2_part1x2.AR, as0, as1, w); } /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp index 971fb36081..7e814646a2 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp @@ -67,9 +67,8 @@ namespace Impl { /// ScalarType, AViewType, BViewType, CViewType> /// (handle, alpha, A, B, beta, C).invoke(); // clang-format on -template +template class BatchedArmplGemm { private: HandleType *const __handle; @@ -107,26 +106,21 @@ class BatchedArmplGemm { for (int ib = 0; ib < __nbatch; ++ib) { for (int i = 0; i < __ninter; ++i) { auto svA = - subview_wrapper(__A, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__A, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); auto svB = - subview_wrapper(__B, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__B, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); auto svC = - subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); - auto info = armpl_dge_interleave( - __ninter, i, __Am, __An, svA.data(), svA.stride(0), svA.stride(1), - &__Adp[__Abstrd * ib], __Aistrd, __Ajstrd); + auto info = armpl_dge_interleave(__ninter, i, __Am, __An, svA.data(), svA.stride(0), svA.stride(1), + &__Adp[__Abstrd * ib], __Aistrd, __Ajstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; os << "armpl_dge_interleave(A) returned:" << info << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - info = armpl_dge_interleave(__ninter, i, __Bm, __Bn, svB.data(), - svB.stride(0), svB.stride(1), + info = armpl_dge_interleave(__ninter, i, __Bm, __Bn, svB.data(), svB.stride(0), svB.stride(1), &__Bdp[__Bbstrd * ib], __Bistrd, __Bjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; @@ -134,8 +128,7 @@ class BatchedArmplGemm { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - info = armpl_dge_interleave(__ninter, i, __Cm, __Cn, svC.data(), - svC.stride(0), svC.stride(1), + info = armpl_dge_interleave(__ninter, i, __Cm, __Cn, svC.data(), svC.stride(0), svC.stride(1), &__Cdp[__Cbstrd * ib], __Cistrd, __Cjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; @@ -152,12 +145,10 @@ class BatchedArmplGemm { for (int ib = 0; ib < __nbatch; ++ib) { for (int i = 0; i < __ninter; ++i) { auto svC = - subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), - Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); + subview_wrapper(__C, ib * __ninter + i, Kokkos::ALL(), Kokkos::ALL(), __batch_layout_tag, __no_trans_tag); - auto info = armpl_dge_deinterleave( - __ninter, i, __Cm, __Cn, svC.data(), svC.stride(0), svC.stride(1), - &__Cdp[__Cbstrd * ib], __Cistrd, __Cjstrd); + auto info = armpl_dge_deinterleave(__ninter, i, __Cm, __Cn, svC.data(), svC.stride(0), svC.stride(1), + &__Cdp[__Cbstrd * ib], __Cistrd, __Cjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; os << "armpl_dge_deinterleave returned:" << info << std::endl; @@ -170,11 +161,10 @@ class BatchedArmplGemm { template std::enable_if_t::value, void> __run(T &) { - auto info = armpl_dgemm_interleave_batch( - __ninter, __nbatch, __transa, __transb, __Cm, __Cn, - std::is_same::value ? __An : __Am, - __alpha, __Adp, __Abstrd, __Aistrd, __Ajstrd, __Bdp, __Bbstrd, __Bistrd, - __Bjstrd, __beta, __Cdp, __Cbstrd, __Cistrd, __Cjstrd); + auto info = armpl_dgemm_interleave_batch(__ninter, __nbatch, __transa, __transb, __Cm, __Cn, + std::is_same::value ? __An : __Am, __alpha, + __Adp, __Abstrd, __Aistrd, __Ajstrd, __Bdp, __Bbstrd, __Bistrd, __Bjstrd, + __beta, __Cdp, __Cbstrd, __Cistrd, __Cjstrd); if (info != ARMPL_STATUS_SUCCESS) { std::ostringstream os; os << "armpl_dgemm_interleave_batch returned :" << info << std::endl; @@ -193,8 +183,7 @@ class BatchedArmplGemm { std::enable_if_t::value, void> __run(T &) {} public: - BatchedArmplGemm(HandleType *const handle, ScalarType alpha, AViewType A, - BViewType B, ScalarType beta, CViewType C) + BatchedArmplGemm(HandleType *const handle, ScalarType alpha, AViewType A, BViewType B, ScalarType beta, CViewType C) : __handle(handle), __A(A), __B(B), __C(C), __alpha(alpha), __beta(beta) { __ninter = __handle->get_tpl_params()[0]; @@ -234,15 +223,11 @@ class BatchedArmplGemm { int invoke() { if (__handle->enableDebug) { - std::cerr << "__nbatch:" << std::to_string(__nbatch) - << ", __ninter:" << std::to_string(__ninter) - << ", __Am:" << std::to_string(__Am) - << ", __An:" << std::to_string(__An) << std::endl; + std::cerr << "__nbatch:" << std::to_string(__nbatch) << ", __ninter:" << std::to_string(__ninter) + << ", __Am:" << std::to_string(__Am) << ", __An:" << std::to_string(__An) << std::endl; } - if (!std::is_same::value || - !std::is_same::value || - !std::is_same::value || + if (!std::is_same::value || !std::is_same::value || !std::is_same::value || !std::is_same::value) { std::ostringstream os; os << "KokkosBatched::Impl::BatchedArmplGemm only supports 'double' " @@ -254,8 +239,7 @@ class BatchedArmplGemm { if (__nbatch != 0) { if (__ninter == 0 || __nbatch % __ninter) { std::ostringstream os; - os << "batch size must be evenly divisible by ninter. __nbatch: " - << std::to_string(__nbatch) + os << "batch size must be evenly divisible by ninter. __nbatch: " << std::to_string(__nbatch) << ", __ninter: " << std::to_string(__ninter) << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp index 50d662b281..6888de725d 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp @@ -126,15 +126,13 @@ using TagFromLayout = typename TagFromLayoutHelper::tag; /// ScalarType, AViewType, BViewType, CViewType /// ArgBoundsCheck, tile_m, tile_n, tile_k>(alpha, A, B, beta, C).invoke(); // clang-format on -template +template class BatchedDblBufGemm { private: using AlphaMulTag = - std::conditional_t::value, - AlphaTag::No, AlphaTag::Yes>; + std::conditional_t::value, AlphaTag::No, AlphaTag::Yes>; HandleType *const __handle; AViewType __A; @@ -153,20 +151,12 @@ class BatchedDblBufGemm { using layout_type = typename CViewType::array_layout; using device_type = typename CViewType::device_type; using execution_space_type = typename device_type::execution_space; - using scratch_space_type = - typename execution_space_type::scratch_memory_space; - using view_type_2d_scratch = - Kokkos::View; + using scratch_space_type = typename execution_space_type::scratch_memory_space; + using view_type_2d_scratch = Kokkos::View; public: - BatchedDblBufGemm(HandleType *const handle, ScalarType alpha, AViewType A, - BViewType B, ScalarType beta, CViewType C) - : __handle(handle), - __A(A), - __B(B), - __C(C), - __alpha(alpha), - __beta(beta) {} + BatchedDblBufGemm(HandleType *const handle, ScalarType alpha, AViewType A, BViewType B, ScalarType beta, CViewType C) + : __handle(handle), __A(A), __B(B), __C(C), __alpha(alpha), __beta(beta) {} int invoke() { __run(); @@ -175,8 +165,7 @@ class BatchedDblBufGemm { private: void __run() { - using policy_type = - Kokkos::TeamPolicy, execution_space_type>; + using policy_type = Kokkos::TeamPolicy, execution_space_type>; using member_type = typename policy_type::member_type; // Compile-time expressions required for functor-level register allocations: @@ -190,7 +179,7 @@ class BatchedDblBufGemm { constexpr int reg_n = TILE_N / TILE_K + 2 * !!(TILE_N % TILE_K); constexpr int stride_m = TILE_K; constexpr int stride_n = TILE_N / reg_n; - using functor_type = Functor; + using functor_type = Functor; functor_type functor(*this, __A, __B, __C); @@ -211,43 +200,35 @@ class BatchedDblBufGemm { int vector_len = stride_n; const int max_team_size = - policy_type(league_size, Kokkos::AUTO, vector_len) - .team_size_max(functor, Kokkos::ParallelForTag()); + policy_type(league_size, Kokkos::AUTO, vector_len).team_size_max(functor, Kokkos::ParallelForTag()); if (team_size > max_team_size) { std::ostringstream os; - os << "KokkosBatched::BatchedGemm with kernelAlgoType = " - << std::to_string(__handle->get_kernel_algo_type()) - << " does not support team_size > " << std::to_string(max_team_size) - << "." << std::endl + os << "KokkosBatched::BatchedGemm with kernelAlgoType = " << std::to_string(__handle->get_kernel_algo_type()) + << " does not support team_size > " << std::to_string(max_team_size) << "." << std::endl << " The tile dimensions must be adjusted." << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - const int max_vector_len = - policy_type(league_size, team_size, Kokkos::AUTO).vector_length_max(); + const int max_vector_len = policy_type(league_size, team_size, Kokkos::AUTO).vector_length_max(); if (vector_len > max_vector_len) { std::ostringstream os; - os << "KokkosBatched::BatchedGemm with kernelAlgoType = " - << std::to_string(__handle->get_kernel_algo_type()) - << " does not support vector_len > " << std::to_string(max_vector_len) - << "." << std::endl + os << "KokkosBatched::BatchedGemm with kernelAlgoType = " << std::to_string(__handle->get_kernel_algo_type()) + << " does not support vector_len > " << std::to_string(max_vector_len) << "." << std::endl << " The tile dimensions must be adjusted." << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (__handle->enableDebug) { - std::cout << "max_team_size:" << max_team_size - << " team_size:" << team_size << std::endl - << "max_vector_len:" << max_vector_len - << " vector_len:" << vector_len << std::endl + std::cout << "max_team_size:" << max_team_size << " team_size:" << team_size << std::endl + << "max_vector_len:" << max_vector_len << " vector_len:" << vector_len << std::endl << "TILE_M:" << TILE_M << std::endl << "TILE_N:" << TILE_N << std::endl << "TILE_K:" << TILE_K << std::endl; } // TODO: Use statically allocated shmem - int shmem_size = view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + - view_type_2d_scratch::shmem_size(TILE_K, TILE_N); + int shmem_size = + view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + view_type_2d_scratch::shmem_size(TILE_K, TILE_N); // Each member solves a portion of TILE_K in parallel with other members policy_type team_policy(league_size, team_size, vector_len); @@ -278,8 +259,7 @@ class BatchedDblBufGemm { // below. If those are used, we get an invalid memory error from cuda. I // suspect this is due the values not being copied to device and then // runtime resolution of the host address &__ei. - Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C) - : __ei(ei), __A(A), __B(B), __C(C) { + Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C) : __ei(ei), __A(A), __B(B), __C(C) { if (std::is_same::value) { ei.__c_batch_size = ei.__C.extent_int(0); ei.__c_m = ei.__C.extent_int(1); @@ -310,24 +290,17 @@ class BatchedDblBufGemm { } KOKKOS_INLINE_FUNCTION - void __mul(view_value_type a, view_value_type b, view_value_type &c, - const AlphaTag::No &) const { - c += a * b; - } + void __mul(view_value_type a, view_value_type b, view_value_type &c, const AlphaTag::No &) const { c += a * b; } KOKKOS_INLINE_FUNCTION - void __mul(view_value_type a, view_value_type b, view_value_type &c, - const AlphaTag::Yes &) const { + void __mul(view_value_type a, view_value_type b, view_value_type &c, const AlphaTag::Yes &) const { c += a * b * __alpha; } KOKKOS_INLINE_FUNCTION - void __rshmem_and_mul(const int &thread_id, const int &vlane_id, - const unsigned &nk, view_value_type reg_a[REG_M], - view_value_type reg_b[REG_N], - view_value_type reg_c[REG_M][REG_N], - view_type_2d_scratch &svA_scr, - view_type_2d_scratch &svB_scr) const { + void __rshmem_and_mul(const int &thread_id, const int &vlane_id, const unsigned &nk, view_value_type reg_a[REG_M], + view_value_type reg_b[REG_N], view_value_type reg_c[REG_M][REG_N], + view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -335,14 +308,12 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + for (int m = 0; m < REG_M; ++m) reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + for (int n = 0; n < REG_N; ++n) reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -351,18 +322,15 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - __mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); + for (int n = 0; n < REG_N; ++n) __mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); } } } KOKKOS_INLINE_FUNCTION - void __rshmem_and_mul_ll(const int &thread_id, const int &vlane_id, - const unsigned &nk, view_value_type reg_a[REG_M], - view_value_type reg_b[REG_N], - view_value_type reg_c[REG_M][REG_N], - view_type_2d_scratch &svA_scr, + void __rshmem_and_mul_ll(const int &thread_id, const int &vlane_id, const unsigned &nk, + view_value_type reg_a[REG_M], view_value_type reg_b[REG_N], + view_value_type reg_c[REG_M][REG_N], view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -371,14 +339,12 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(k, vlane_id + m * STRIDE_M); + for (int m = 0; m < REG_M; ++m) reg_a[m] = svA_scr(k, vlane_id + m * STRIDE_M); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(thread_id + n * STRIDE_N, k); + for (int n = 0; n < REG_N; ++n) reg_b[n] = svB_scr(thread_id + n * STRIDE_N, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -387,8 +353,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - __mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); + for (int n = 0; n < REG_N; ++n) __mul(reg_a[m], reg_b[n], reg_c[m][n], __ei.__alpha_mul_tag); } } } @@ -401,8 +366,7 @@ class BatchedDblBufGemm { view_value_type prefetch_reg_a[REG_M] = {0}, prefetch_reg_b[REG_N] = {0}; // Allocate registers used for FMAs - view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, - reg_c[REG_M][REG_N] = {{0}}; + view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, reg_c[REG_M][REG_N] = {{0}}; // TODO: look at local loads and stores via nvprof // TODO: look at GPU trace in nvprof to find out how many registers are // used. @@ -417,147 +381,124 @@ class BatchedDblBufGemm { int kk; // Fetch entire 2-rank sub-matrix - auto svA = subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transA_tag); - auto svB = subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transB_tag); - auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag); + auto svA = + subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transA_tag); + auto svB = + subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transB_tag); + auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching view_type_2d_scratch svA_scr(member.team_scratch(0), TILE_M, TILE_K); view_type_2d_scratch svB_scr(member.team_scratch(0), TILE_K, TILE_N); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, STRIDE_M), - [&](const int &thread_id) { - int m_offset = thread_id + start_m; + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_M), [&](const int &thread_id) { + int m_offset = thread_id + start_m; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, STRIDE_N), - [&](const int &vlane_id) { - int n_offset = vlane_id + start_n; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_N), [&](const int &vlane_id) { + int n_offset = vlane_id + start_n; // Here we populate scratch memory with one or more "k" tiles for // every thread of the team! #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id + i) = - access_view_bounds_check( - svB, thread_id, n_offset + i, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + svB_scr(thread_id, vlane_id + i) = + access_view_bounds_check(svB, thread_id, n_offset + i, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id + i, vlane_id) = - access_view_bounds_check( - svA, m_offset + i, vlane_id, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = + access_view_bounds_check(svA, m_offset + i, vlane_id, __ei.__bounds_check_tag); - // Wait for A, B to reside in scratch memory - member.team_barrier(); + // Wait for A, B to reside in scratch memory + member.team_barrier(); - // Each thread calculates a single dot product in chunks of - // size TILE_K - for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { - int k_tile_offset = kk + TILE_K; + // Each thread calculates a single dot product in chunks of + // size TILE_K + for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { + int k_tile_offset = kk + TILE_K; // Get this threads next TILE_K entries from global memory // Each thread has its own copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - prefetch_reg_b[i] = - access_view_bounds_check( - svB, thread_id + k_tile_offset, - n_offset + i * STRIDE_N, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = access_view_bounds_check( + svB, thread_id + k_tile_offset, n_offset + i * STRIDE_N, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - prefetch_reg_a[i] = - access_view_bounds_check( - svA, m_offset + i * STRIDE_M, - vlane_id + k_tile_offset, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = access_view_bounds_check( + svA, m_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); - __rshmem_and_mul(thread_id, vlane_id, TILE_K, reg_a, reg_b, - reg_c, svA_scr, svB_scr); + __rshmem_and_mul(thread_id, vlane_id, TILE_K, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // Wait for: - // 1. prefetch_regs to be populated - // 2. for shmem to no longer be read from - member.team_barrier(); + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id + i * STRIDE_N) = - prefetch_reg_b[i]; + for (int i = 0; i < REG_N; ++i) svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_a. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - svA_scr(thread_id + i * STRIDE_M, vlane_id) = - prefetch_reg_a[i]; + for (int i = 0; i < REG_M; ++i) svA_scr(thread_id + i * STRIDE_M, vlane_id) = prefetch_reg_a[i]; - // Wait for shmem stores to land before performing next - // TILE_K multiply - member.team_barrier(); - } // end n_tile_k_tiles loop + // Wait for shmem stores to land before performing next + // TILE_K multiply + member.team_barrier(); + } // end n_tile_k_tiles loop - // Multiply last tile, may be a partial tile - __rshmem_and_mul(thread_id, vlane_id, __k - kk, reg_a, reg_b, - reg_c, svA_scr, svB_scr); + // Multiply last tile, may be a partial tile + __rshmem_and_mul(thread_id, vlane_id, __k - kk, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // store results back to global memory - if (__beta == 0.0F) { + // store results back to global memory + if (__beta == 0.0F) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } else { + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __ei.__alpha_fma_tag, __ei.__bounds_check_tag); + } + } + } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __beta, __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } - }); - }); + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __beta, __ei.__alpha_fma_tag, + __ei.__bounds_check_tag); + } + } + } + }); + }); } KOKKOS_INLINE_FUNCTION @@ -568,8 +509,7 @@ class BatchedDblBufGemm { view_value_type prefetch_reg_a[REG_M] = {0}, prefetch_reg_b[REG_N] = {0}; // Allocate registers used for FMAs - view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, - reg_c[REG_M][REG_N] = {{0}}; + view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, reg_c[REG_M][REG_N] = {{0}}; // TODO: look at local loads and stores via nvprof // TODO: look at GPU trace in nvprof to find out how many registers are // used. @@ -584,149 +524,126 @@ class BatchedDblBufGemm { int kk; // Fetch entire 2-rank sub-matrix - auto svA = subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transA_tag); - auto svB = subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag, __ei.__transB_tag); - auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), - __ei.__batch_layout_tag); + auto svA = + subview_wrapper(__A, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transA_tag); + auto svB = + subview_wrapper(__B, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag, __ei.__transB_tag); + auto svC = subview_wrapper(__C, batch_idx, Kokkos::ALL(), Kokkos::ALL(), __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching view_type_2d_scratch svA_scr(member.team_scratch(0), TILE_K, TILE_M); view_type_2d_scratch svB_scr(member.team_scratch(0), TILE_N, TILE_K); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, STRIDE_N), - [&](const int &thread_id) { - int n_offset = thread_id + start_n; + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_N), [&](const int &thread_id) { + int n_offset = thread_id + start_n; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, STRIDE_M), - [&](const int &vlane_id) { - int m_offset = vlane_id + start_m; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_M), [&](const int &vlane_id) { + int m_offset = vlane_id + start_m; // Here we populate scratch memory with one or more "k" tiles for // every thread of the team! #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id + i, vlane_id) = - access_view_bounds_check( - svB, vlane_id, n_offset + i, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + svB_scr(thread_id + i, vlane_id) = + access_view_bounds_check(svB, vlane_id, n_offset + i, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id, vlane_id + i) = - access_view_bounds_check( - svA, m_offset + i, thread_id, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id, vlane_id + i) = + access_view_bounds_check(svA, m_offset + i, thread_id, __ei.__bounds_check_tag); - // Wait for A, B to reside in scratch memory - member.team_barrier(); + // Wait for A, B to reside in scratch memory + member.team_barrier(); - // Each thread calculates a single dot product in chunks of - // size TILE_K - for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { - int k_tile_offset = kk + TILE_K; + // Each thread calculates a single dot product in chunks of + // size TILE_K + for (kk = 0; kk < __k - TILE_K; kk += TILE_K) { + int k_tile_offset = kk + TILE_K; // Get this threads next TILE_K entries from global memory // Each thread has its own copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - prefetch_reg_b[i] = - access_view_bounds_check( - svB, vlane_id + k_tile_offset, - n_offset + i * STRIDE_N, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = access_view_bounds_check( + svB, vlane_id + k_tile_offset, n_offset + i * STRIDE_N, __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - prefetch_reg_a[i] = - access_view_bounds_check( - svA, m_offset + i * STRIDE_M, - thread_id + k_tile_offset, - __ei.__bounds_check_tag); + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = access_view_bounds_check( + svA, m_offset + i * STRIDE_M, thread_id + k_tile_offset, __ei.__bounds_check_tag); - __rshmem_and_mul_ll(thread_id, vlane_id, TILE_K, reg_a, - reg_b, reg_c, svA_scr, svB_scr); + __rshmem_and_mul_ll(thread_id, vlane_id, TILE_K, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // Wait for: - // 1. prefetch_regs to be populated - // 2. for shmem to no longer be read from - member.team_barrier(); + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id + i * STRIDE_N, vlane_id) = - prefetch_reg_b[i]; + for (int i = 0; i < REG_N; ++i) svB_scr(thread_id + i * STRIDE_N, vlane_id) = prefetch_reg_b[i]; // populate shmem from prefetch registers. Each thread has its own // copy of prefetch_reg_a. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - svA_scr(thread_id, vlane_id + i * STRIDE_M) = - prefetch_reg_a[i]; + for (int i = 0; i < REG_M; ++i) svA_scr(thread_id, vlane_id + i * STRIDE_M) = prefetch_reg_a[i]; - // Wait for shmem stores to land before performing next - // TILE_K multiply - member.team_barrier(); - } // end n_tile_k_tiles loop + // Wait for shmem stores to land before performing next + // TILE_K multiply + member.team_barrier(); + } // end n_tile_k_tiles loop - // Multiply last tile, may be a partial tile - __rshmem_and_mul_ll(thread_id, vlane_id, __k - kk, reg_a, - reg_b, reg_c, svA_scr, svB_scr); + // Multiply last tile, may be a partial tile + __rshmem_and_mul_ll(thread_id, vlane_id, __k - kk, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // store results back to global memory - if (__beta == 0.0F) { + // store results back to global memory + if (__beta == 0.0F) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } else { + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __ei.__alpha_fma_tag, __ei.__bounds_check_tag); + } + } + } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = n_offset + n * STRIDE_N; + for (int n = 0; n < REG_N; ++n) { + int cn = n_offset + n * STRIDE_N; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = m_offset + m * STRIDE_M; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, - __beta, __ei.__alpha_fma_tag, - __ei.__bounds_check_tag); - } - } - } - }); - }); + for (int m = 0; m < REG_M; ++m) { + int cm = m_offset + m * STRIDE_M; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __alpha, __beta, __ei.__alpha_fma_tag, + __ei.__bounds_check_tag); + } + } + } + }); + }); } }; }; diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 464ea6d04a..6216aeb099 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -62,30 +62,21 @@ constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { #endif // __CUDAACC_RDC__ } -template -int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, +template +int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { int ret = 0; size_t c_m, c_n; using ViewValueType = typename CViewType::value_type; // Check for valid input views - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "CViewType must be a Kokkos::View."); + static_assert(std::is_same::value || std::is_same::value, + "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); + static_assert(std::is_same::value || std::is_same::value, + "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); if constexpr (is_vector::value) { // Check ranks of view with underlying SIMD value types // For SIMD views, we can have either 3-rank or 4-ranks inputs. @@ -100,31 +91,27 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, default: std::ostringstream os; os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) - << " with SIMD views." << std::endl; + << std::to_string(handle->get_kernel_algo_type()) << " with SIMD views." << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); break; } } else { // Check ranks of views with underlying scalar value types - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); + static_assert(static_cast(AViewType::rank) == 3, "AViewType must have rank 3."); + static_assert(static_cast(BViewType::rank) == 3, "BViewType must have rank 3."); + static_assert(static_cast(CViewType::rank) == 3, "CViewType must have rank 3."); } // Check for valid data access patterns // Skip checking a_layout == b_layout == c_layout // Skip checking for LayoutStride using c_layout = typename CViewType::array_layout; - static_assert(!(std::is_same::value && - !std::is_same::value), - "LayoutLeft views require BatchLayout::Right"); - static_assert(!(std::is_same::value && - !std::is_same::value), - "LayoutRight views require BatchLayout::Left"); + static_assert( + !(std::is_same::value && !std::is_same::value), + "LayoutLeft views require BatchLayout::Right"); + static_assert( + !(std::is_same::value && !std::is_same::value), + "LayoutRight views require BatchLayout::Left"); if constexpr (std::is_same::value) { // c_b = C.extent(0); @@ -141,17 +128,13 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, using layout_type = typename CViewType::array_layout; using exec_space = typename CViewType::execution_space; constexpr bool is_vector = KokkosBatched::is_vector::value; - constexpr bool on_gpu = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< - typename exec_space::memory_space>(); - constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< - typename exec_space::memory_space>(); - bool out_of_range = false; + constexpr bool on_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space(); + constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space(); + bool out_of_range = false; if (handle->enableDebug) { - std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() - << std::endl + std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() << std::endl << "execution_space:" << typeid(exec_space).name() << std::endl << std::endl << "is_vector:" << is_vector << std::endl @@ -166,79 +149,58 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, if (c_m != c_n) { std::ostringstream os; os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" - << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" - << std::endl; + << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" << std::to_string(c_m) << ") != c_n(" + << std::to_string(c_n) << ")" << std::endl; KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Select optimal resultsPerThread param for BatchedSerialGemm using bsgResultsPerThread = - std::conditional_t; + std::conditional_t; // Select optimal mode param for SerialGemm. using bsgModeType = typename std::conditional< - is_vector, - typename std::conditional::type, + is_vector, typename std::conditional::type, typename std::conditional< on_gpu, Algo::Gemm::Unblocked, - typename std::conditional::type>::type>:: - type; + typename std::conditional::type>::type>::type; if (handle->enableDebug) { - std::cout << "bsgResultsPerThread: " - << typeid(bsgResultsPerThread).name() << std::endl + std::cout << "bsgResultsPerThread: " << typeid(bsgResultsPerThread).name() << std::endl << "bsgModeType: " << typeid(bsgModeType).name() << std::endl; } if constexpr (on_gpu) { - if (((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { - handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = Impl::kk_gemm_dbl_buf_tile_m(); - constexpr int tile_n = Impl::kk_gemm_dbl_buf_tile_n(); - constexpr int tile_k = Impl::kk_gemm_dbl_buf_tile_k(); - constexpr size_t alpha_in_fma_thresh = - Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); + if (((std::is_same::value) ? (c_m >= 16) + : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { + handle->teamSz = handle->vecLen = 8; + constexpr int tile_m = Impl::kk_gemm_dbl_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dbl_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dbl_buf_tile_k(); + constexpr size_t alpha_in_fma_thresh = Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); if (c_m % 32 == 0) { // No bounds checking if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } else { // apply alpha in mul - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } } else { // bounds checking if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } else { // apply alpha in mul - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, beta, C) .invoke(); } } @@ -247,10 +209,8 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, } } if (!on_gpu || out_of_range) { - ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) .invoke(); } break; @@ -261,10 +221,8 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, ////////////// TPL ALGOS ////////////// #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 case BaseTplAlgos::ARMPL: - ret = Impl::BatchedArmplGemm(handle, alpha, A, B, - beta, C) + ret = Impl::BatchedArmplGemm(handle, alpha, A, B, beta, C) .invoke(); break; #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL @@ -276,23 +234,17 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, ////////////// KokkosBatched ALGOS ////////////// case BaseKokkosBatchedAlgos::KK_SERIAL: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); break; // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); break; // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: @@ -308,11 +260,9 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, // performance. // TODO: Add auto-selection of tile size based on inputs and device type - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) + ret = Impl::BatchedDblBufGemm(handle, alpha, A, B, + beta, C) .invoke(); break; diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index 5ff581bb64..8da3c7acd1 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -76,9 +76,8 @@ namespace Impl { /// ArgResultsPerThread, ScalarType, AViewType, /// BViewType, CViewType>(alpha, A, B, beta, C).invoke(); // clang-format on -template +template class BatchedSerialGemm { private: AViewType A; @@ -92,10 +91,8 @@ class BatchedSerialGemm { void run() { using execution_space = typename CViewType::device_type::execution_space; - using policy_type = - Kokkos::RangePolicy; - Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size), - *this); + using policy_type = Kokkos::RangePolicy; + Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size), *this); } public: @@ -117,8 +114,7 @@ class BatchedSerialGemm { batch_size *= divisor; run(); - } else if (std::is_same::value) { + } else if (std::is_same::value) { if (std::is_same::value) batch_size = C.extent(0); else @@ -132,8 +128,7 @@ class BatchedSerialGemm { return 0; } - BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, - ScalarType _beta, CViewType _C) + BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, ScalarType _beta, CViewType _C) : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {} KOKKOS_INLINE_FUNCTION @@ -149,34 +144,26 @@ class BatchedSerialGemm { // Due to taking 1-rank subviews out, we must handle transpose here. // Use overloads of subview_wrapper to handle transpose at compile time. - auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), - batch_layout_tag, transA_tag); - auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, - batch_layout_tag, transB_tag); - auto svC_ele = - subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); + auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), batch_layout_tag, transA_tag); + auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, batch_layout_tag, transB_tag); + auto svC_ele = subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); // Kokkos::subview(scalar, ALL) or Kokkos::subview(ALL, scalar) always // returns a column vector. Since the subviews above handle the // matrix transpositions, here we must perform the GEMM on: // row_vec x col_vec, which is svA_row' x svB_col to compute the element // of C. - KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, - svC_ele); + KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, + svC_ele); } KOKKOS_INLINE_FUNCTION void operator()(const ResultsPerThread::Rank2 &, const int &i) const { - auto svA = - subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svB = - subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svC = - subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svA = subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svB = subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svC = subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - KokkosBatched::SerialGemm::invoke( - alpha, svA, svB, beta, svC); + KokkosBatched::SerialGemm::invoke(alpha, svA, svB, beta, svC); } }; } // namespace Impl diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp index 6ec792172b..6f06694f09 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp @@ -29,17 +29,15 @@ namespace KokkosBatched { namespace Impl { // Specialization struct which defines whether a specialization exists // This struct is currently never specialized. -template +template struct batched_gemm_tpl_spec_avail { enum : bool { value = false }; }; // Specialization struct which defines whether a specialization exists -template +template struct batched_gemm_eti_spec_avail { enum : bool { value = false }; }; @@ -47,71 +45,55 @@ struct batched_gemm_eti_spec_avail { } // namespace KokkosBatched // ETI specalization macros, consumed by generated *_eti_spec_avail.hpp files -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ - ARG_BATCH_LAYOUT, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct batched_gemm_eti_spec_avail< \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct batched_gemm_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) #else -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) #else -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif ///////////////// BatchLayout::Left Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) // Include the BLL ETI specalizations #include @@ -120,29 +102,21 @@ struct batched_gemm_eti_spec_avail { #include ///////////////// BatchLayout::Right Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) // Include the BLR ETI specalizations #include @@ -152,19 +126,15 @@ struct batched_gemm_eti_spec_avail { namespace KokkosBatched { namespace Impl { -template ::value, - bool eti_spec_avail = batched_gemm_eti_spec_avail< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType>::value> +template ::value, + bool eti_spec_avail = batched_gemm_eti_spec_avail::value> struct BatchedGemmSpec { - static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C) + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, + const ScalarType beta, const CViewType &C) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION @@ -172,23 +142,20 @@ struct BatchedGemmSpec { printf( "KokkosBatched::BatchedGemm<> ETI specialization for < %s, %s, %s, " "%s, %s, %s, %s, %s >\n", - typeid(ArgTransA).name(), typeid(ArgTransB).name(), - typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), - typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(ArgTransA).name(), typeid(ArgTransB).name(), typeid(ArgBatchSzDim).name(), + typeid(BatchedGemmHandleType).name(), typeid(ScalarType).name(), typeid(AViewType).name(), typeid(BViewType).name(), typeid(CViewType).name()); #else printf( "KokkosBatched::BatchedGemm<> non-ETI specialization for < %s, %s, " "%s, %s, %s, %s, %s, %s >\n", - typeid(ArgTransA).name(), typeid(ArgTransB).name(), - typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), - typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(ArgTransA).name(), typeid(ArgTransB).name(), typeid(ArgBatchSzDim).name(), + typeid(BatchedGemmHandleType).name(), typeid(ScalarType).name(), typeid(AViewType).name(), typeid(BViewType).name(), typeid(CViewType).name()); #endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #endif // KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - return KokkosBatched::Impl::BatchedGemmImpl< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, - AViewType, BViewType, CViewType>(handle, alpha, A, B, beta, C); + return KokkosBatched::Impl::BatchedGemmImpl(handle, alpha, A, B, beta, C); } #else ; @@ -199,92 +166,68 @@ struct BatchedGemmSpec { } // namespace KokkosBatched // ETI instantiation macros, consumed by *.cpp.in files -#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ - ARG_BATCH_LAYOUT, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct BatchedGemmSpec< \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - false, true>; +#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template struct BatchedGemmSpec, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) #else -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) #else -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif ///////////////// BatchLayout::Left Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) ///////////////// BatchLayout::Right Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) -#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) #endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ diff --git a/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp index 4a3e26685b..c8f5c7a20e 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp @@ -29,11 +29,9 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int SerialHouseholder::invoke( - const aViewType &a, const tauViewType &tau) { - return SerialLeftHouseholderInternal::invoke(a.extent(0) - 1, a.data(), - a.data() + a.stride(0), - a.stride(0), tau.data()); +KOKKOS_INLINE_FUNCTION int SerialHouseholder::invoke(const aViewType &a, const tauViewType &tau) { + return SerialLeftHouseholderInternal::invoke(a.extent(0) - 1, a.data(), a.data() + a.stride(0), a.stride(0), + tau.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp index 05654a2f37..0257ff4d9b 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp @@ -61,8 +61,7 @@ struct SerialLeftHouseholderInternal { const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::ArithTraits::sqrt( - norm_x2_square + norm_chi1 * norm_chi1); + const mag_type norm_x = Kokkos::ArithTraits::sqrt(norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha const mag_type alpha = (*chi1 < 0 ? one : minus_one) * norm_x; @@ -76,9 +75,8 @@ struct SerialLeftHouseholderInternal { // SerialScaleInternal::invoke(m_x2, inv_chi1_minus_alpha, x2, x2s); /// compute tau - const mag_type chi1_minus_alpha_square = - chi1_minus_alpha * chi1_minus_alpha; - *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); + const mag_type chi1_minus_alpha_square = chi1_minus_alpha * chi1_minus_alpha; + *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); /// overwrite chi1 with alpha *chi1 = alpha; diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp index 955e1a72b8..bc55a646bc 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp @@ -29,11 +29,10 @@ namespace KokkosBatched { template template -KOKKOS_INLINE_FUNCTION int TeamVectorHouseholder::invoke( - const MemberType &member, const aViewType &a, const tauViewType &tau) { - return TeamVectorLeftHouseholderInternal::invoke( - member, a.extent(0) - 1, a.data(), a.data() + a.stride(0), a.stride(0), - tau.data()); +KOKKOS_INLINE_FUNCTION int TeamVectorHouseholder::invoke(const MemberType &member, const aViewType &a, + const tauViewType &tau) { + return TeamVectorLeftHouseholderInternal::invoke(member, a.extent(0) - 1, a.data(), a.data() + a.stride(0), + a.stride(0), tau.data()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp index 64fe24fa31..1074dc4280 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -30,8 +30,7 @@ namespace KokkosBatched { /// struct TeamVectorLeftHouseholderInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m_x2, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m_x2, /* */ ValueType *chi1, /* */ ValueType *x2, const int x2s, /* */ ValueType *tau) { @@ -67,8 +66,7 @@ struct TeamVectorLeftHouseholderInternal { const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::ArithTraits::sqrt( - norm_x2_square + norm_chi1 * norm_chi1); + const mag_type norm_x = Kokkos::ArithTraits::sqrt(norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha const mag_type alpha = (*chi1 < 0 ? one : minus_one) * norm_x; @@ -76,9 +74,8 @@ struct TeamVectorLeftHouseholderInternal { /// overwrite x2 with u2 const value_type chi1_minus_alpha = *chi1 - alpha; const value_type inv_chi1_minus_alpha = one / chi1_minus_alpha; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, m_x2), - [&](const int &i) { x2[i * x2s] *= inv_chi1_minus_alpha; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m_x2), + [&](const int &i) { x2[i * x2s] *= inv_chi1_minus_alpha; }); member.team_barrier(); // later consider to use the following @@ -86,9 +83,8 @@ struct TeamVectorLeftHouseholderInternal { /// compute tau Kokkos::single(Kokkos::PerTeam(member), [&]() { - const mag_type chi1_minus_alpha_square = - chi1_minus_alpha * chi1_minus_alpha; - *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); + const mag_type chi1_minus_alpha_square = chi1_minus_alpha * chi1_minus_alpha; + *tau = half + half * (norm_x2_square / chi1_minus_alpha_square); /// overwrite chi1 with alpha *chi1 = alpha; diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp index d59f9e0c0b..eb576f1dff 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp @@ -29,31 +29,26 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], a_40 = A[4 * _as0 + 0 * _as1], + a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], a_44 = A[4 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p, b_4p, c_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -100,29 +95,25 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1], a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p, /**/ c_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, - ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, + ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -163,27 +154,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], + a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, /**/ c_3p, /**/ c_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, + ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -217,25 +205,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_40 = A[4 * _as0 + 0 * _as1], + a_41 = A[4 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, /**/ c_2p, /**/ c_3p, /**/ c_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, - ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, + ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -263,15 +250,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_40 = A[4 * _as0 + 0 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_40 = A[4 * _as0 + 0 * _as1]; ValueType b_0p, c_0p, /**/ c_1p, @@ -279,8 +265,7 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke( /**/ c_3p, /**/ c_4p; - const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0, ic4 = 4 * _cs0; + const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0, ic4 = 4 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -303,28 +288,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -365,25 +346,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0, ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -417,22 +395,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, b_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0, + ic1 = 1 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -460,20 +437,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1]; ValueType b_0p, c_0p, b_1p, b_2p, b_3p, b_4p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ib4 = 4 * _bs0, ic0 = 0 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ib4 = 4 * _bs0, ic0 = 0 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -496,10 +471,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) { @@ -548,12 +524,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( InnerGemmFixA<2, 2> inner(_as0, _as1, _bs0, _bs1, _cs0, _cs1); for (int i = 0; i < m; i += 2) for (int p = 0; p < k; p += 2) - inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, - (i + 2 > m ? 1 : 2), n, (p + 2 > k ? 1 : 2), - C + i * _cs0); + inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, (i + 2 > m ? 1 : 2), n, + (p + 2 > k ? 1 : 2), C + i * _cs0); } else { - Kokkos::abort( - "InnerGemmFixA<5,5>::serial_invoke, assert failure (m<5 && n<5)"); + Kokkos::abort("InnerGemmFixA<5,5>::serial_invoke, assert failure (m<5 && n<5)"); } break; } @@ -568,25 +542,23 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p, c_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, + ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -622,24 +594,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], + a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, /**/ c_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, + ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -670,23 +640,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, /**/ c_2p, /**/ c_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, - ic2 = 2 * _cs0, ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -712,22 +680,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_30 = A[3 * _as0 + 0 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_30 = A[3 * _as0 + 0 * _as1]; ValueType b_0p, c_0p, /**/ c_1p, /**/ c_2p, /**/ c_3p; - const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, - ic3 = 3 * _cs0; + const int ib0 = 0 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0, ic3 = 3 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -748,23 +715,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p, b_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, + ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -795,21 +760,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, b_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0, ic1 = 1 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -835,19 +798,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1]; ValueType b_0p, c_0p, b_1p, b_2p, b_3p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, - ic0 = 0 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ib3 = 3 * _bs0, ic0 = 0 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -868,10 +830,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) { @@ -915,12 +878,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( InnerGemmFixA<2, 2> inner(_as0, _as1, _bs0, _bs1, _cs0, _cs1); for (int i = 0; i < m; i += 2) for (int p = 0; p < k; p += 2) - inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, - (i + 2 > m ? 1 : 2), n, (p + 2 > k ? 1 : 2), - C + i * _cs0); + inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, (i + 2 > m ? 1 : 2), n, + (p + 2 > k ? 1 : 2), C + i * _cs0); } else { - Kokkos::abort( - "InnerGemmFixA<4,4>::serial_invoke, assert failure (m<4 && n<4)"); + Kokkos::abort("InnerGemmFixA<4,4>::serial_invoke, assert failure (m<4 && n<4)"); } break; } @@ -935,22 +896,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], + a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p, c_2p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0, ic2 = 2 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -977,21 +935,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, /**/ c_2p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, - ic2 = 2 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0, ic2 = 2 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -1014,14 +970,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_20 = A[2 * _as0 + 0 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1]; ValueType b_0p, c_0p, /**/ c_1p, @@ -1046,20 +1001,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p, b_2p; - const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, - ic1 = 1 * _cs0; + const int ib0 = 0 * _bs0, ib1 = 1 * _bs0, ib2 = 2 * _bs0, ic0 = 0 * _cs0, ic1 = 1 * _cs0; for (int p = 0; p < n; ++p) { b_0p = B[ib0 + p * _bs1]; @@ -1081,14 +1034,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1]; ValueType b_0p, c_0p, b_1p, b_2p; @@ -1111,10 +1063,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) { @@ -1148,12 +1101,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( InnerGemmFixA<2, 2> inner(_as0, _as1, _bs0, _bs1, _cs0, _cs1); for (int i = 0; i < m; i += 2) for (int p = 0; p < k; p += 2) - inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, - (i + 2 > m ? 1 : 2), n, (p + 2 > k ? 1 : 2), - C + i * _cs0); + inner.serial_invoke(alpha, A + i * _as0 + p * _as1, B + p * _bs0, (i + 2 > m ? 1 : 2), n, + (p + 2 > k ? 1 : 2), C + i * _cs0); } else { - Kokkos::abort( - "InnerGemmFixA<3,3>::serial_invoke, assert failure (m<3 && n<3)"); + Kokkos::abort("InnerGemmFixA<3,3>::serial_invoke, assert failure (m<3 && n<3)"); } break; } @@ -1168,14 +1119,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; - const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1]; + const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1]; ValueType b_0p, c_0p, b_1p, c_1p; @@ -1199,10 +1150,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_10 = A[1 * _as0 + 0 * _as1]; @@ -1227,10 +1178,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; const ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1]; @@ -1254,10 +1205,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (m * 10 + k) { @@ -1282,8 +1234,7 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( break; } default: { - Kokkos::abort( - "InnerGemmFixA<2,2>::serial_invoke, assert failure (m<2 && n<2)"); + Kokkos::abort("InnerGemmFixA<2,2>::serial_invoke, assert failure (m<2 && n<2)"); break; } } @@ -1297,10 +1248,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixA<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int n, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixA<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, + /**/ ValueType *KOKKOS_RESTRICT C) { if (n <= 0) return 0; const ValueType a_00 = A[0 * _as0 + 0 * _as1]; diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp index a725bf5b45..6912c285a6 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp @@ -29,31 +29,26 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_24 = B[2 * _bs0 + 4 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], - b_33 = B[3 * _bs0 + 3 * _bs1], b_34 = B[3 * _bs0 + 4 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], - b_42 = B[4 * _bs0 + 2 * _bs1], b_43 = B[4 * _bs0 + 3 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], b_24 = B[2 * _bs0 + 4 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_33 = B[3 * _bs0 + 3 * _bs1], b_34 = B[3 * _bs0 + 4 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1], + b_41 = B[4 * _bs0 + 1 * _bs1], b_42 = B[4 * _bs0 + 2 * _bs1], b_43 = B[4 * _bs0 + 3 * _bs1], b_44 = B[4 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -100,28 +95,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], - b_32 = B[3 * _bs0 + 2 * _bs1], b_33 = B[3 * _bs0 + 3 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_33 = B[3 * _bs0 + 3 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], b_42 = B[4 * _bs0 + 2 * _bs1], b_43 = B[4 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -162,25 +153,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], - b_42 = B[4 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], + b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1], b_42 = B[4 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1, c_p2; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1, jc2 = 2 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -214,22 +202,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1], b_41 = B[4 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1], + b_41 = B[4 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1, + jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -257,20 +244,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_40 = B[4 * _bs0 + 0 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_40 = B[4 * _bs0 + 0 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, a_p4, c_p0; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - ja4 = 4 * _as1, jc0 = 0 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, ja4 = 4 * _as1, jc0 = 0 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -293,28 +278,24 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_24 = B[2 * _bs0 + 4 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], b_24 = B[2 * _bs0 + 4 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], b_33 = B[3 * _bs0 + 3 * _bs1], b_34 = B[3 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, - jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, + jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -355,25 +336,22 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_24 = B[2 * _bs0 + 4 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], b_24 = B[2 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, + jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -407,22 +385,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_13 = B[1 * _bs0 + 3 * _bs1], b_14 = B[1 * _bs0 + 4 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], + b_14 = B[1 * _bs0 + 4 * _bs1]; ValueType a_p0, a_p1, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, - jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, + jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -450,20 +427,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_04 = B[0 * _bs0 + 4 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_04 = B[0 * _bs0 + 4 * _bs1]; ValueType a_p0, c_p0, c_p1, c_p2, c_p3, c_p4; - const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1, jc4 = 4 * _cs1; + const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1, jc4 = 4 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -486,10 +461,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -544,25 +520,23 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], - b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], - b_32 = B[3 * _bs0 + 2 * _bs1], b_33 = B[3 * _bs0 + 3 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1], + b_33 = B[3 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, + jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -598,23 +572,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], - b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], + b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1], b_32 = B[3 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1, c_p2; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, + jc2 = 2 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -645,21 +617,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1], b_31 = B[3 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1, jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -685,19 +655,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_30 = B[3 * _bs0 + 0 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_30 = B[3 * _bs0 + 0 * _bs1]; ValueType a_p0, a_p1, a_p2, a_p3, c_p0; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, - jc0 = 0 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, ja3 = 3 * _as1, jc0 = 0 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -718,23 +687,21 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], + b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1], b_23 = B[2 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, + jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -765,21 +732,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], b_13 = B[1 * _bs0 + 3 * _bs1]; ValueType a_p0, a_p1, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, - jc2 = 2 * _cs1, jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -805,19 +770,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_03 = B[0 * _bs0 + 3 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_03 = B[0 * _bs0 + 3 * _bs1]; ValueType a_p0, c_p0, c_p1, c_p2, c_p3; - const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, - jc3 = 3 * _cs1; + const int ja0 = 0 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1, jc3 = 3 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -838,10 +802,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -886,22 +851,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], - b_22 = B[2 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1], + b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1], b_22 = B[2 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1, c_p2; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1, jc2 = 2 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1, jc2 = 2 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -928,20 +890,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1], b_21 = B[2 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -964,14 +924,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_20 = B[2 * _bs0 + 0 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], b_20 = B[2 * _bs0 + 0 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0; @@ -994,20 +953,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_02 = B[0 * _bs0 + 2 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], - b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_02 = B[0 * _bs0 + 2 * _bs1], + b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1], b_12 = B[1 * _bs0 + 2 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0, c_p1; - const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, - jc1 = 1 * _cs1; + const int ja0 = 0 * _as1, ja1 = 1 * _as1, ja2 = 2 * _as1, jc0 = 0 * _cs1, jc1 = 1 * _cs1; for (int p = 0; p < m; ++p) { a_p0 = A[p * _bs0 + ja0]; @@ -1029,14 +986,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, a_p2, c_p0; @@ -1059,10 +1016,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -1097,14 +1055,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; - const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], - b_10 = B[1 * _bs0 + 0 * _bs1], b_11 = B[1 * _bs0 + 1 * _bs1]; + const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1], + b_11 = B[1 * _bs0 + 1 * _bs1]; ValueType a_p0, a_p1, c_p0, c_p1; @@ -1128,10 +1086,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_10 = B[1 * _bs0 + 0 * _bs1]; @@ -1155,10 +1113,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; const ValueType b_00 = B[0 * _bs0 + 0 * _bs1], b_01 = B[0 * _bs0 + 1 * _bs1]; @@ -1182,10 +1140,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; switch (k * 10 + n) { @@ -1210,10 +1169,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0) return 0; const ValueType b_00 = B[0 * _bs0 + 0 * _bs1]; @@ -1239,10 +1198,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<1, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixB<0, 0>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixB<0, 0>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; if (k == n) { @@ -1276,10 +1236,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixB<0, 0>::serial_invoke( } else { for (int i = 0; i < m; ++i) { const ValueType *KOKKOS_RESTRICT iA = A + i * _as0; - /**/ ValueType *KOKKOS_RESTRICT iC = C + i * _cs0; + /**/ ValueType *KOKKOS_RESTRICT iC = C + i * _cs0; for (int j = 0; j < n; ++j) { const ValueType *KOKKOS_RESTRICT jB = B + j * _bs1; - /**/ ValueType tC = 0; + /**/ ValueType tC = 0; for (int p = 0; p < k; ++p) tC += iA[p * _as1] * jB[p * _bs0]; pC[i * _cs0] += alpha * tC; } diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp index 8bdf4fee4f..9ad08549cb 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp @@ -29,22 +29,19 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, - c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, - c_22 = 0, c_23 = 0, c_24 = 0, a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, - c_33 = 0, c_34 = 0, a_4p, b_p4, c_40 = 0, c_41 = 0, c_42 = 0, c_43 = 0, - c_44 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, c_24 = 0, a_3p, b_p3, + c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0, c_34 = 0, a_4p, b_p4, c_40 = 0, c_41 = 0, c_42 = 0, + c_43 = 0, c_44 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, + j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -119,21 +116,18 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, a_1p, b_p1, - c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, a_2p, b_p2, - c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, a_3p, b_p3, - c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0, a_4p, c_40 = 0, - c_41 = 0, c_42 = 0, c_43 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, + a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, + c_33 = 0, a_4p, c_40 = 0, c_41 = 0, c_42 = 0, c_43 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, + j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -197,19 +191,17 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, a_1p, b_p1, c_10 = 0, - c_11 = 0, c_12 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, - c_22 = 0, a_3p, c_30 = 0, c_31 = 0, c_32 = 0, a_4p, - c_40 = 0, c_41 = 0, c_42 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, a_2p, b_p2, c_20 = 0, + c_21 = 0, c_22 = 0, a_3p, c_30 = 0, c_31 = 0, c_32 = 0, a_4p, c_40 = 0, c_41 = 0, c_42 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, + j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -262,18 +254,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, - a_2p, c_20 = 0, c_21 = 0, a_3p, c_30 = 0, c_31 = 0, - a_4p, c_40 = 0, c_41 = 0; + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, a_2p, c_20 = 0, c_21 = 0, a_3p, c_30 = 0, + c_31 = 0, a_4p, c_40 = 0, c_41 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -315,17 +305,15 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, a_1p, c_10 = 0, a_2p, c_20 = 0, a_3p, - c_30 = 0, a_4p, c_40 = 0; + ValueType a_0p, b_p0, c_00 = 0, a_1p, c_10 = 0, a_2p, c_20 = 0, a_3p, c_30 = 0, a_4p, c_40 = 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0, j0 = 0 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0, j0 = 0 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -356,35 +344,32 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, - b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, - c_24 = 0, a_3p, b_p3, c_30 = 0, c_31 = 0, c_32 = 0, - c_33 = 0, c_34 = 0, + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, c_24 = 0, a_3p, b_p3, + c_30 = 0, c_31 = 0, c_32 = 0, c_33 = 0, c_34 = 0, /**/ b_p4; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, - j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, + j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; - a_2p = A[i2 + p * _as1]; - b_p2 = B[p * _bs0 + j2]; - a_3p = A[i3 + p * _as1]; - b_p3 = B[p * _bs0 + j3]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; + a_2p = A[i2 + p * _as1]; + b_p2 = B[p * _bs0 + j2]; + a_3p = A[i3 + p * _as1]; + b_p3 = B[p * _bs0 + j3]; /**/ b_p4 = B[p * _bs0 + j4]; c_00 += a_0p * b_p0; @@ -435,32 +420,30 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, - b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, - a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, - c_24 = 0, + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, a_2p, b_p2, c_20 = 0, c_21 = 0, c_22 = 0, c_23 = 0, c_24 = 0, /**/ b_p3, /**/ b_p4; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, + j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; - a_2p = A[i2 + p * _as1]; - b_p2 = B[p * _bs0 + j2]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; + a_2p = A[i2 + p * _as1]; + b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; /**/ b_p4 = B[p * _bs0 + j4]; @@ -502,29 +485,28 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, - b_p1, c_10 = 0, c_11 = 0, c_12 = 0, c_13 = 0, c_14 = 0, + ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, a_1p, b_p1, c_10 = 0, c_11 = 0, c_12 = 0, + c_13 = 0, c_14 = 0, /**/ b_p2, /**/ b_p3, /**/ b_p4; - const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, - j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; /**/ b_p4 = B[p * _bs0 + j4]; @@ -557,10 +539,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = 0, c_01 = 0, c_02 = 0, c_03 = 0, c_04 = 0, @@ -569,15 +551,14 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke( /**/ b_p3, /**/ b_p4; - const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1, j4 = 4 * _bs1; + const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1, j4 = 4 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; /**/ b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; @@ -604,22 +585,19 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), - c_03 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), - c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), - c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), a_3p, b_p3, - c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0), - c_33 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, + c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, + c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), a_3p, b_p3, + c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0), c_33 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, + j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -674,20 +652,17 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, - c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), a_3p, - c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), + c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), + c_22 = ValueType(0), a_3p, c_30 = ValueType(0), c_31 = ValueType(0), c_32 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -733,19 +708,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), a_2p, - c_20 = ValueType(0), c_21 = ValueType(0), a_3p, - c_30 = ValueType(0), c_31 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0), a_3p, c_30 = ValueType(0), c_31 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1, j1 = 1 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -782,17 +754,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, - c_20 = ValueType(0), a_3p, c_30 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, c_20 = ValueType(0), a_3p, + c_30 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - j0 = 0 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, j0 = 0 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -820,32 +791,29 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), - c_03 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), - c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), - c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), - /**/ b_p3; + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, + c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), a_2p, b_p2, + c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0), c_23 = ValueType(0), + /**/ b_p3; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; - a_2p = A[i2 + p * _as1]; - b_p2 = B[p * _bs0 + j2]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; + a_2p = A[i2 + p * _as1]; + b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; c_00 += a_0p * b_p0; @@ -880,30 +848,27 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), - c_12 = ValueType(0), c_13 = ValueType(0), + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), a_1p, b_p1, + c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), c_13 = ValueType(0), /**/ b_p2, /**/ b_p3; - const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, - j2 = 2 * _bs1, j3 = 3 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; @@ -931,27 +896,25 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), c_03 = ValueType(0), + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), c_03 = ValueType(0), /**/ b_p1, /**/ b_p2, /**/ b_p3; - const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, - j3 = 3 * _bs1; + const int i0 = 0 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1, j3 = 3 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; /**/ b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; /**/ b_p3 = B[p * _bs0 + j3]; @@ -976,19 +939,17 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, - c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, - c_20 = ValueType(0), c_21 = ValueType(0), c_22 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), + c_11 = ValueType(0), c_12 = ValueType(0), a_2p, b_p2, c_20 = ValueType(0), c_21 = ValueType(0), + c_22 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1, j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -1027,18 +988,16 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0), a_2p, - c_20 = ValueType(0), c_21 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), + a_2p, c_20 = ValueType(0), c_21 = ValueType(0); - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, - j1 = 1 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -1070,14 +1029,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, - c_20 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0), a_2p, c_20 = ValueType(0); const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, j0 = 0 * _bs1; @@ -1104,28 +1062,26 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0), c_12 = ValueType(0), /**/ b_p2; - const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, - j2 = 2 * _bs1; + const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1, j2 = 2 * _bs1; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; - a_1p = A[i1 + p * _as1]; - b_p1 = B[p * _bs0 + j1]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; + a_1p = A[i1 + p * _as1]; + b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; c_00 += a_0p * b_p0; @@ -1147,14 +1103,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), - c_02 = ValueType(0), + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), c_02 = ValueType(0), /**/ b_p1, /**/ b_p2; @@ -1164,8 +1119,8 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke( #pragma unroll #endif for (int p = 0; p < k; ++p) { - a_0p = A[i0 + p * _as1]; - b_p0 = B[p * _bs0 + j0]; + a_0p = A[i0 + p * _as1]; + b_p0 = B[p * _bs0 + j0]; /**/ b_p1 = B[p * _bs0 + j1]; /**/ b_p2 = B[p * _bs0 + j2]; @@ -1187,14 +1142,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; - ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, - c_10 = ValueType(0), c_11 = ValueType(0); + ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), a_1p, b_p1, c_10 = ValueType(0), c_11 = ValueType(0); const int i0 = 0 * _as0, i1 = 1 * _as0, j0 = 0 * _bs1, j1 = 1 * _bs1; @@ -1223,10 +1177,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = ValueType(0), a_1p, c_10 = ValueType(0); @@ -1253,10 +1207,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = ValueType(0), c_01 = ValueType(0), @@ -1286,10 +1240,10 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (k <= 0) return 0; ValueType a_0p, b_p0, c_00 = ValueType(0); @@ -1311,10 +1265,11 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<0, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<0, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || k <= 0) return 0; switch (m) { @@ -1353,14 +1308,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<0, 1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 5 && n <= 5)) - Kokkos::abort( - "InnerGemmFixC<5,5>::serial_invoke, assert failure (m<=5 && n<=5)"); + if (!(m <= 5 && n <= 5)) Kokkos::abort("InnerGemmFixC<5,5>::serial_invoke, assert failure (m<=5 && n<=5)"); switch (m * 10 + n) { case 55: { @@ -1419,14 +1373,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<5, 5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 4 && n <= 4)) - Kokkos::abort( - "InnerGemmFixC<4,4>::serial_invoke, assert failure (m<=4 && n<=4)"); + if (!(m <= 4 && n <= 4)) Kokkos::abort("InnerGemmFixC<4,4>::serial_invoke, assert failure (m<=4 && n<=4)"); switch (m * 10 + n) { case 44: { @@ -1475,14 +1428,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<4, 4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 3 && n <= 3)) - Kokkos::abort( - "InnerGemmFixC<3,3>::serial_invoke, assert failure (m<=3 && n<=3)"); + if (!(m <= 3 && n <= 3)) Kokkos::abort("InnerGemmFixC<3,3>::serial_invoke, assert failure (m<=3 && n<=3)"); switch (m * 10 + n) { case 33: { @@ -1521,14 +1473,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<3, 3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 2 && n <= 2)) - Kokkos::abort( - "InnerGemmFixC<2,2>::serial_invoke, assert failure (m<=2 && n<=2)"); + if (!(m <= 2 && n <= 2)) Kokkos::abort("InnerGemmFixC<2,2>::serial_invoke, assert failure (m<=2 && n<=2)"); switch (m * 10 + n) { case 22: { @@ -1557,14 +1508,13 @@ KOKKOS_INLINE_FUNCTION int InnerGemmFixC<2, 2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke( - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { +KOKKOS_INLINE_FUNCTION int InnerGemmFixC<1, 1>::serial_invoke(const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { if (m <= 0 || n <= 0 || k <= 0) return 0; - if (!(m <= 1 && n <= 1)) - Kokkos::abort( - "InnerGemmFixC<1,1>::serial_invoke, assert failure (m<=1 && n<=1)"); + if (!(m <= 1 && n <= 1)) Kokkos::abort("InnerGemmFixC<1,1>::serial_invoke, assert failure (m<=1 && n<=1)"); return serial_invoke(alpha, A, B, k, C); ; diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp index 116545f653..a3d6dece58 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp @@ -25,43 +25,38 @@ namespace KokkosBatched { template template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke( - const MemberType &member, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, - const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, mb * nb), [&](const int &ij) { - const int i = ij / nb, j = ij % nb; +KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, mb * nb), [&](const int &ij) { + const int i = ij / nb, j = ij % nb; - const ValueType *KOKKOS_RESTRICT pA = A + i * _as0, - *KOKKOS_RESTRICT pB = B + j * _bs1; + const ValueType *KOKKOS_RESTRICT pA = A + i * _as0, *KOKKOS_RESTRICT pB = B + j * _bs1; - ValueType c = 0; - for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; - C[i * _cs0 + j * _cs1] += alpha * c; - }); + ValueType c = 0; + for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; + C[i * _cs0 + j * _cs1] += alpha * c; + }); return 0; } template template -KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke( - const MemberType &member, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, const int k, - /**/ ValueType *KOKKOS_RESTRICT C) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { - const int i = ij / n, j = ij % n; +KOKKOS_INLINE_FUNCTION int InnerGemmFixC::team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, + const int n, const int k, + /**/ ValueType *KOKKOS_RESTRICT C) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + const int i = ij / n, j = ij % n; - const ValueType *KOKKOS_RESTRICT pA = A + i * _as0, - *KOKKOS_RESTRICT pB = B + j * _bs1; + const ValueType *KOKKOS_RESTRICT pA = A + i * _as0, *KOKKOS_RESTRICT pB = B + j * _bs1; - ValueType c = 0; - for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; - C[i * _cs0 + j * _cs1] += alpha * c; - }); + ValueType c = 0; + for (int p = 0; p < k; ++p) c += pA[p * _as1] * pB[p * _bs0]; + C[i * _cs0 + j * _cs1] += alpha * c; + }); return 0; } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp index 3089d068bb..0d74598b24 100644 --- a/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp @@ -29,21 +29,16 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], + a_14 = A[1 * _as0 + 4 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1], a_34 = A[3 * _as0 + 4 * _as1], a_40 = A[4 * _as0 + 0 * _as1], + a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1], a_44 = A[4 * _as0 + 4 * _as1]; // 0 iteration @@ -121,17 +116,14 @@ KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], - a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], - a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], - a_32 = A[3 * _as0 + 2 * _as1], a_33 = A[3 * _as0 + 3 * _as1]; + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_03 = A[0 * _as0 + 3 * _as1], a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], + a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_20 = A[2 * _as0 + 0 * _as1], + a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1], a_23 = A[2 * _as0 + 3 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_33 = A[3 * _as0 + 3 * _as1]; // 0 iteration a_10 /= a_00; @@ -178,14 +170,11 @@ KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_02 = A[0 * _as0 + 2 * _as1], a_10 = A[1 * _as0 + 0 * _as1], - a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], - a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], - a_22 = A[2 * _as0 + 2 * _as1]; + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], + a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1], a_12 = A[1 * _as0 + 2 * _as1], + a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], a_22 = A[2 * _as0 + 2 * _as1]; // 0 iteration a_10 /= a_00; @@ -212,11 +201,10 @@ KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( - ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke(ValueType *KOKKOS_RESTRICT A) { // load - ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], - a_10 = A[1 * _as0 + 0 * _as1], a_11 = A[1 * _as0 + 1 * _as1]; + ValueType a_00 = A[0 * _as0 + 0 * _as1], a_01 = A[0 * _as0 + 1 * _as1], a_10 = A[1 * _as0 + 0 * _as1], + a_11 = A[1 * _as0 + 1 * _as1]; // 0 iteration a_10 /= a_00; @@ -231,15 +219,13 @@ KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke( - ValueType *KOKKOS_RESTRICT /* A */) { +KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke(ValueType *KOKKOS_RESTRICT /* A */) { return 0; } template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 5) Kokkos::abort("InnerLU<5>::serial_invoke, assert failure (m<=5)"); if (m <= 0) return 0; @@ -275,8 +261,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 4) Kokkos::abort("InnerLU<4>::serial_invoke, assert failure (m<=4)"); if (m <= 0) return 0; @@ -307,8 +292,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 3) Kokkos::abort("InnerLU<3>::serial_invoke, assert failure (m<=3)"); if (m <= 0) return 0; @@ -334,8 +318,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 2) Kokkos::abort("InnerLU<2>::serial_invoke, assert failure (m<=2)"); if (m <= 0) return 0; @@ -356,8 +339,7 @@ KOKKOS_INLINE_FUNCTION int InnerLU<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke( - const int m, ValueType *KOKKOS_RESTRICT A) { +KOKKOS_INLINE_FUNCTION int InnerLU<1>::serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A) { if (m > 1) Kokkos::abort("InnerLU<1>::serial_invoke, assert failure (m<=1)"); if (m <= 0) return 0; diff --git a/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp index 539980a705..04825ac61c 100644 --- a/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp @@ -30,19 +30,16 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], + a_43 = A[4 * _as0 + 3 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -87,17 +84,14 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -134,16 +128,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -173,9 +164,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_10 = A[1 * _as0 + 0 * _as1]; @@ -205,9 +195,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT /* A */, const int /* n */, - /**/ ValueType *KOKKOS_RESTRICT /* B */) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT /* A */, + const int /* n */, + /**/ ValueType *KOKKOS_RESTRICT /* B */) { return 0; } @@ -218,12 +208,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 5) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<5>::serial_invoke, assert failure (m<=5)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 5) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<5>::serial_invoke, assert failure (m<=5)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 5: { @@ -256,12 +244,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 4) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<4>::serial_invoke, assert failure (m<=4)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 4) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<4>::serial_invoke, assert failure (m<=4)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 4: { @@ -289,12 +275,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<4>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 3) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<3>::serial_invoke, assert failure (m<=3)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 3) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<3>::serial_invoke, assert failure (m<=3)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 3: { @@ -317,12 +301,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 2) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<2>::serial_invoke, assert failure (m<=2)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 2) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<2>::serial_invoke, assert failure (m<=2)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 2: { @@ -340,12 +322,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 1) - Kokkos::abort( - "InnerTrsmLeftLowerUnitDiag<1>::serial_invoke, assert failure (m<=1)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 1) Kokkos::abort("InnerTrsmLeftLowerUnitDiag<1>::serial_invoke, assert failure (m<=1)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 1: { @@ -364,16 +344,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], - a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], - a_42 = A[4 * _as0 + 2 * _as1], a_43 = A[4 * _as0 + 3 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1], + a_40 = A[4 * _as0 + 0 * _as1], a_41 = A[4 * _as0 + 1 * _as1], a_42 = A[4 * _as0 + 2 * _as1], + a_43 = A[4 * _as0 + 3 * _as1]; // const ValueType // a_00 = A[0*_as0+0*_as1], @@ -382,19 +361,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( // a_33 = A[3*_as0+3*_as1], // a_44 = A[4*_as0+4*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1], - inv_a_44 = - static_cast(1.0) / A[4 * _as0 + 4 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1], + inv_a_44 = static_cast(1.0) / A[4 * _as0 + 4 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -448,14 +421,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1], a_30 = A[3 * _as0 + 0 * _as1], - a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1], + a_30 = A[3 * _as0 + 0 * _as1], a_31 = A[3 * _as0 + 1 * _as1], a_32 = A[3 * _as0 + 2 * _as1]; // const ValueType // a_00 = A[0*_as0+0*_as1], @@ -463,17 +435,12 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( // a_22 = A[2*_as0+2*_as1], // a_33 = A[3*_as0+3*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -518,28 +485,23 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], - a_21 = A[2 * _as0 + 1 * _as1]; + const ValueType a_10 = A[1 * _as0 + 0 * _as1], a_20 = A[2 * _as0 + 0 * _as1], a_21 = A[2 * _as0 + 1 * _as1]; // const ValueType // a_00 = A[0*_as0+0*_as1], // a_11 = A[1*_as0+1*_as1], // a_22 = A[2*_as0+2*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -576,9 +538,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_10 = A[1 * _as0 + 0 * _as1]; @@ -587,10 +549,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( // a_00 = A[0*_as0+0*_as1], // a_11 = A[1*_as0+1*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1]; auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p) { // load @@ -622,16 +582,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; // const ValueType // a_00 = A[0*_as0+0*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1]; auto trsv = [&](const int p, ValueType & /* b_0p */) { B[0 * _bs0 + p * _bs1] *= inv_a_00; /* b_0p /= a_00;*/ @@ -655,9 +614,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 5) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke, assert failure " @@ -694,9 +653,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 4) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke, assert failure " @@ -728,9 +687,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<4>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 3) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke, assert failure " @@ -757,9 +716,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 2) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke, assert failure " @@ -781,9 +740,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 1) Kokkos::abort( "InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke, assert failure " @@ -806,21 +765,17 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftLowerNonUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - /**/ a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + a_04 = A[0 * _as0 + 4 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], + /**/ a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], /**/ a_34 = A[3 * _as0 + 4 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -866,19 +821,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], /**/ a_23 = A[2 * _as0 + 3 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -916,16 +867,14 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], /**/ a_12 = A[1 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -956,9 +905,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1]; @@ -988,9 +936,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT /* A */, const int /* n */, - /**/ ValueType *KOKKOS_RESTRICT /* B */) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT /* A */, + const int /* n */, + /**/ ValueType *KOKKOS_RESTRICT /* B */) { return 0; } @@ -1001,12 +949,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 5) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<5>::serial_invoke, assert failure (m<=5)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 5) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<5>::serial_invoke, assert failure (m<=5)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 5: { @@ -1039,12 +985,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 4) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<4>::serial_invoke, assert failure (m<=4)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 4) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<4>::serial_invoke, assert failure (m<=4)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 4: { @@ -1072,12 +1016,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<4>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 3) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<3>::serial_invoke, assert failure (m<=3)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 3) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<3>::serial_invoke, assert failure (m<=3)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 3: { @@ -1100,12 +1042,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 2) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<2>::serial_invoke, assert failure (m<=2)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 2) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<2>::serial_invoke, assert failure (m<=2)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 2: { @@ -1123,12 +1063,10 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { - if (m > 1) - Kokkos::abort( - "InnerTrsmLeftUpperUnitDiag<1>::serial_invoke, assert failure (m<=1)"); +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { + if (m > 1) Kokkos::abort("InnerTrsmLeftUpperUnitDiag<1>::serial_invoke, assert failure (m<=1)"); if (m <= 0 || n <= 0) return 0; switch (m) { case 1: { @@ -1147,17 +1085,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], a_04 = A[0 * _as0 + 4 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], - /**/ a_23 = A[2 * _as0 + 3 * _as1], - a_24 = A[2 * _as0 + 4 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + a_04 = A[0 * _as0 + 4 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], a_14 = A[1 * _as0 + 4 * _as1], + /**/ a_23 = A[2 * _as0 + 3 * _as1], a_24 = A[2 * _as0 + 4 * _as1], /**/ a_34 = A[3 * _as0 + 4 * _as1]; // const ValueType @@ -1167,19 +1103,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( // a_33 = A[3*_as0+3*_as1], // a_44 = A[4*_as0+4*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1], - inv_a_44 = - static_cast(1.0) / A[4 * _as0 + 4 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1], + inv_a_44 = static_cast(1.0) / A[4 * _as0 + 4 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p, ValueType &b_4p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -1233,15 +1163,13 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; - const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], - a_03 = A[0 * _as0 + 3 * _as1], - /**/ a_12 = A[1 * _as0 + 2 * _as1], - a_13 = A[1 * _as0 + 3 * _as1], + const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], a_03 = A[0 * _as0 + 3 * _as1], + /**/ a_12 = A[1 * _as0 + 2 * _as1], a_13 = A[1 * _as0 + 3 * _as1], /**/ a_23 = A[2 * _as0 + 3 * _as1]; // const ValueType @@ -1250,17 +1178,12 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( // a_22 = A[2*_as0+2*_as1], // a_33 = A[3*_as0+3*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1], - inv_a_33 = - static_cast(1.0) / A[3 * _as0 + 3 * _as1]; - - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p, ValueType &b_3p) { + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1], + inv_a_33 = static_cast(1.0) / A[3 * _as0 + 3 * _as1]; + + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p, ValueType &b_3p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -1305,9 +1228,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1], a_02 = A[0 * _as0 + 2 * _as1], @@ -1318,15 +1241,11 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( // a_11 = A[1*_as0+1*_as1], // a_22 = A[2*_as0+2*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1], - inv_a_22 = - static_cast(1.0) / A[2 * _as0 + 2 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1], + inv_a_22 = static_cast(1.0) / A[2 * _as0 + 2 * _as1]; - auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, - ValueType &b_2p) { + auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p, ValueType &b_2p) { // load b_0p = B[0 * _bs0 + p * _bs1]; b_1p = B[1 * _bs0 + p * _bs1]; @@ -1363,9 +1282,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; const ValueType a_01 = A[0 * _as0 + 1 * _as1]; @@ -1374,10 +1293,8 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( // a_00 = A[0*_as0+0*_as1], // a_11 = A[1*_as0+1*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1], - inv_a_11 = - static_cast(1.0) / A[1 * _as0 + 1 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1], + inv_a_11 = static_cast(1.0) / A[1 * _as0 + 1 * _as1]; auto trsv = [&](const int p, ValueType &b_0p, ValueType &b_1p) { // load @@ -1409,16 +1326,15 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (n <= 0) return 0; // const ValueType // a_00 = A[0*_as0+0*_as1]; - const ValueType inv_a_00 = - static_cast(1.0) / A[0 * _as0 + 0 * _as1]; + const ValueType inv_a_00 = static_cast(1.0) / A[0 * _as0 + 0 * _as1]; auto trsv = [&](const int p, ValueType & /* b_0p */) { // 0 iteration @@ -1443,9 +1359,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke( template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 5) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke, assert failure " @@ -1482,9 +1398,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<5>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 4) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke, assert failure " @@ -1516,9 +1432,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<4>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 3) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke, assert failure " @@ -1545,9 +1461,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<3>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 2) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke, assert failure " @@ -1569,9 +1485,9 @@ KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<2>::serial_invoke( } template <> template -KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke( - const ValueType *KOKKOS_RESTRICT A, const int m, const int n, - /**/ ValueType *KOKKOS_RESTRICT B) { +KOKKOS_INLINE_FUNCTION int InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke(const ValueType *KOKKOS_RESTRICT A, + const int m, const int n, + /**/ ValueType *KOKKOS_RESTRICT B) { if (m > 1) Kokkos::abort( "InnerTrsmLeftUpperNonUnitDiag<1>::serial_invoke, assert failure " diff --git a/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp index 070a620531..215c62e9f2 100644 --- a/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp @@ -32,49 +32,42 @@ namespace KokkosBatched { /// InverseLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template -KOKKOS_INLINE_FUNCTION int SerialInverseLU::invoke( - const AViewType &A, const WViewType &W) { +KOKKOS_INLINE_FUNCTION int SerialInverseLU::invoke(const AViewType &A, + const WViewType &W) { typedef typename AViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = A.extent(0), n = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); static_assert(AViewType::rank == 2, "A should have two dimensions"); static_assert(WViewType::rank == 1, "W should have one dimension"); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "A and W should be on the same memory space"); - static_assert(!std::is_same::value, + static_assert(!std::is_same::value, "W should be an contiguous 1D array"); assert(A.extent(0) * A.extent(1) * sizeof(typename AViewType::value_type) <= W.span() * sizeof(typename WViewType::value_type)); assert(m == n); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; int r_val = 0; if (A.stride(0) == 1) { - mkl_dgetrinp_compact( - MKL_COL_MAJOR, n, (double *)A.data(), A.stride(1), (double *)W.data(), - (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, - format, (MKL_INT)vector_type::vector_length); + mkl_dgetrinp_compact(MKL_COL_MAJOR, n, (double *)A.data(), A.stride(1), (double *)W.data(), + (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, format, + (MKL_INT)vector_type::vector_length); } else if (A.stride(1) == 1) { - mkl_dgetrinp_compact( - MKL_ROW_MAJOR, n, (double *)A.data(), A.stride(0), (double *)W.data(), - (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, - format, (MKL_INT)vector_type::vector_length); + mkl_dgetrinp_compact(MKL_ROW_MAJOR, n, (double *)A.data(), A.stride(0), (double *)W.data(), + (MKL_INT)(n * n * vector_type::vector_length), (MKL_INT *)&r_val, format, + (MKL_INT)vector_type::vector_length); } else { r_val = -1; } diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp index 2fa372aa7c..e2acd012cb 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp @@ -31,35 +31,28 @@ namespace KokkosBatched { /// SerialLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template KOKKOS_INLINE_FUNCTION int SerialLU::invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny) { + const AViewType &A, const typename MagnitudeScalarType::type tiny) { typedef typename AViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = A.extent(0), n = A.extent(1); static_assert(is_vector::value, "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; int r_val = 0; if (A.stride_0() == 1) { - mkl_dgetrfnp_compact(MKL_COL_MAJOR, m, n, (double *)A.data(), A.stride_1(), - (MKL_INT *)&r_val, format, + mkl_dgetrfnp_compact(MKL_COL_MAJOR, m, n, (double *)A.data(), A.stride_1(), (MKL_INT *)&r_val, format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { - mkl_dgetrfnp_compact(MKL_ROW_MAJOR, m, n, (double *)A.data(), A.stride_0(), - (MKL_INT *)&r_val, format, + mkl_dgetrfnp_compact(MKL_ROW_MAJOR, m, n, (double *)A.data(), A.stride_0(), (MKL_INT *)&r_val, format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -71,21 +64,17 @@ KOKKOS_INLINE_FUNCTION int SerialLU::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialLU::invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny) { - return SerialLU_Internal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), tiny); + const AViewType &A, const typename MagnitudeScalarType::type tiny) { + return SerialLU_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), + tiny); } template <> template KOKKOS_INLINE_FUNCTION int SerialLU::invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny) { - return SerialLU_Internal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), tiny); + const AViewType &A, const typename MagnitudeScalarType::type tiny) { + return SerialLU_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), + tiny); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp index e6b34d8f1b..6555a16d93 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp @@ -33,16 +33,15 @@ namespace KokkosBatched { template struct SerialLU_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const typename MagnitudeScalarType::type tiny); + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const typename MagnitudeScalarType::type tiny); }; template <> template KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( - const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const typename MagnitudeScalarType::type tiny) { + const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const typename MagnitudeScalarType::type tiny) { const int k = (m < n ? m : n); if (k <= 0) return 0; @@ -55,14 +54,12 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( const ValueType *KOKKOS_RESTRICT a12t = A + (p)*as0 + (p + 1) * as1; - ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, - *KOKKOS_RESTRICT A22 = - A + (p + 1) * as0 + (p + 1) * as1; + ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, + *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; if (tiny != 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; - const auto alpha11_real = - Kokkos::ArithTraits::real(alpha11_reference); + const auto alpha11_real = Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } @@ -76,8 +73,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < jend; ++j) - A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; + for (int j = 0; j < jend; ++j) A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; } } return 0; @@ -86,8 +82,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( - const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const typename MagnitudeScalarType::type /*tiny*/) { constexpr int mbAlgo = Algo::LU::Blocked::mb(); const typename MagnitudeScalarType::type one(1.0), minus_one(-1.0); @@ -100,8 +95,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( InnerTrsmLeftLowerUnitDiag trsm_llu(as0, as1, as0, as1); InnerTrsmLeftLowerNonUnitDiag trsm_run(as1, as0, as1, as0); - auto lu_factorize = [&](const int ib, const int jb, - ValueType *KOKKOS_RESTRICT AA) { + auto lu_factorize = [&](const int ib, const int jb, ValueType *KOKKOS_RESTRICT AA) { const int mb = mbAlgo; const int kb = ib < jb ? ib : jb; for (int p = 0; p < kb; p += mb) { @@ -121,9 +115,8 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( trsm_run.serial_invoke(Ap, pb, m_abr, Ap + mb * as0); // gemm update - SerialGemmInternal::invoke( - m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, Ap + mb * as1, - as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); + SerialGemmInternal::invoke(m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, + Ap + mb * as1, as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); } }; diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp index 3f28c063b8..9ed5e244d2 100644 --- a/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp @@ -36,11 +36,9 @@ struct TeamLU { template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0) { - return TeamLU_Internal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), - tiny); + const typename MagnitudeScalarType::type tiny = 0) { + return TeamLU_Internal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), tiny); } }; @@ -49,11 +47,9 @@ struct TeamLU { template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0) { - return TeamLU_Internal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), - tiny); + const typename MagnitudeScalarType::type tiny = 0) { + return TeamLU_Internal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), tiny); } }; diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp index cbc811de5e..dacfb02ed4 100644 --- a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp @@ -35,17 +35,15 @@ namespace KokkosBatched { template struct TeamLU_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const typename MagnitudeScalarType::type tiny); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const typename MagnitudeScalarType::type tiny); }; template <> template KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( - const MemberType &member, const int m, const int n, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const MemberType &member, const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const typename MagnitudeScalarType::type tiny) { const int k = (m < n ? m : n); if (k <= 0) return 0; @@ -60,15 +58,13 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( const ValueType *KOKKOS_RESTRICT a12t = A + (p)*as0 + (p + 1) * as1; - ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, - *KOKKOS_RESTRICT A22 = - A + (p + 1) * as0 + (p + 1) * as1; + ValueType *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p)*as1, + *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; if (tiny != 0) { if (member.team_rank() == 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; - const auto alpha11_real = - Kokkos::ArithTraits::real(alpha11_reference); + const auto alpha11_real = Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } @@ -76,19 +72,17 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( member.team_barrier(); const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), - [&](const int &i) { - // a21[i*as0] *= inv_alpha11; - a21[i * as0] /= alpha11; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), [&](const int &i) { + // a21[i*as0] *= inv_alpha11; + a21[i * as0] /= alpha11; + }); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { - // assume layout right for batched computation - const int i = ij / jend, j = ij % jend; - A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { + // assume layout right for batched computation + const int i = ij / jend, j = ij % jend; + A22[i * as0 + j * as1] -= a21[i * as0] * a12t[j * as1]; + }); } return 0; } @@ -96,8 +90,7 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( - const MemberType &member, const int m, const int n, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + const MemberType &member, const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, const typename MagnitudeScalarType::type /*tiny*/) { constexpr int mbAlgo = Algo::LU::Blocked::mb(); @@ -110,15 +103,11 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( InnerTrsmLeftLowerUnitDiag trsm_llu(as0, as1, as0, as1); InnerTrsmLeftLowerNonUnitDiag trsm_run(as1, as0, as1, as0); - auto lu_factorize = [&](const int ib, const int jb, - ValueType *KOKKOS_RESTRICT AA) { + auto lu_factorize = [&](const int ib, const int jb, ValueType *KOKKOS_RESTRICT AA) { const int tsize = member.team_size(); // Made this non-const in order to WORKAROUND issue #349 int mb = mbAlgo; - int nb = ((jb - mb) + (ib - mb)) > 0 - ? ((jb - mb) + (ib - mb)) / tsize + - (((jb - mb) + (ib - mb)) % tsize > 0) - : 1; + int nb = ((jb - mb) + (ib - mb)) > 0 ? ((jb - mb) + (ib - mb)) / tsize + (((jb - mb) + (ib - mb)) % tsize > 0) : 1; const int kb = ib < jb ? ib : jb; for (int p = 0; p < kb; p += mb) { @@ -133,29 +122,24 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( member.team_barrier(); // Made this non-const in order to WORKAROUND issue #349 - int m_abr = ib - p - mb, n_abr = jb - p - mb, mp_abr = m_abr % nb, - np_abr = n_abr % nb, mq_abr = (m_abr / nb) + (mp_abr > 0), - nq_abr = (n_abr / nb) + (np_abr > 0); + int m_abr = ib - p - mb, n_abr = jb - p - mb, mp_abr = m_abr % nb, np_abr = n_abr % nb, + mq_abr = (m_abr / nb) + (mp_abr > 0), nq_abr = (n_abr / nb) + (np_abr > 0); // trsm update - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, mq_abr + nq_abr), - [&](const int &ij) { - if (ij < nq_abr) { - const int j = (ij)*nb, qb = (j + nb) > n_abr ? np_abr : nb; - trsm_llu.serial_invoke(Ap, pb, qb, Ap + (j + mb) * as1); - } else { - const int i = (ij - nq_abr) * nb, - qb = (i + nb) > m_abr ? mp_abr : nb; - trsm_run.serial_invoke(Ap, pb, qb, Ap + (i + mb) * as0); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, mq_abr + nq_abr), [&](const int &ij) { + if (ij < nq_abr) { + const int j = (ij)*nb, qb = (j + nb) > n_abr ? np_abr : nb; + trsm_llu.serial_invoke(Ap, pb, qb, Ap + (j + mb) * as1); + } else { + const int i = (ij - nq_abr) * nb, qb = (i + nb) > m_abr ? mp_abr : nb; + trsm_run.serial_invoke(Ap, pb, qb, Ap + (i + mb) * as0); + } + }); member.team_barrier(); // gemm update - TeamGemmInternal::invoke( - member, m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, - Ap + mb * as1, as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); + TeamGemmInternal::invoke(member, m_abr, n_abr, pb, minus_one, Ap + mb * as0, as0, as1, + Ap + mb * as1, as0, as1, one, Ap + mb * as0 + mb * as1, as0, as1); } }; diff --git a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp index ea87217a37..c266d65c54 100644 --- a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp @@ -45,12 +45,9 @@ struct SerialLeftEigenvectorFromSchurInternal { /// contiguous workspace that can hold complex array (m) template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ ValueType *S, const int ss0, - const int ss1, - /* */ ValueType *V, const int vs0, - const int vs1, - /* */ ValueType *w, - const int *blks) { + /* */ ValueType *S, const int ss0, const int ss1, + /* */ ValueType *V, const int vs0, const int vs1, + /* */ ValueType *w, const int *blks) { typedef ValueType value_type; typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; @@ -77,8 +74,7 @@ struct SerialLeftEigenvectorFromSchurInternal { for (; m_stl < (m - 1);) { /// part 2x2 into 3x3 const int mA11 = blks[m_stl]; - assert(((mA11 == 1) || (mA11 == 2)) && - "LeftEigenvectorFromSchur: blk is not 1x1 nor 2x2"); + assert(((mA11 == 1) || (mA11 == 2)) && "LeftEigenvectorFromSchur: blk is not 1x1 nor 2x2"); S_part3x3.partWithABR(S_part2x2, mA11, mA11); V_part3x1.partWithAB(V_part2x1, mA11); @@ -90,23 +86,19 @@ struct SerialLeftEigenvectorFromSchurInternal { /// initialize a left hand side b[m_stl] = one; - for (int j = 0; j < (m - m_stl_plus_mA11); ++j) - b[j + m_stl_plus_mA11] = -S_part3x3.A12[j * ss1]; + for (int j = 0; j < (m - m_stl_plus_mA11); ++j) b[j + m_stl_plus_mA11] = -S_part3x3.A12[j * ss1]; /// perform shifted trsv (transposed) - SerialShiftedTrsvInternalLower::invoke( - m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, - b + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); + SerialShiftedTrsvInternalLower::invoke(m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, + b + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); /// copy back to V (row wise copy) for (int j = 0; j < m_stl; ++j) V_part3x1.A1[j * vs1] = zero; for (int j = m_stl; j < m; ++j) V_part3x1.A1[j * vs1] = b[j]; } else { /// complex eigen pair - const value_type alpha11 = S_part3x3.A11[0], - alpha12 = S_part3x3.A11[ss1], - alpha21 = S_part3x3.A11[ss0], - beta = ats::sqrt(-alpha12 * alpha21); + const value_type alpha11 = S_part3x3.A11[0], alpha12 = S_part3x3.A11[ss1], alpha21 = S_part3x3.A11[ss0], + beta = ats::sqrt(-alpha12 * alpha21); const complex_type lambda(alpha11, beta); complex_type *bc = (complex_type *)(b); @@ -118,13 +110,11 @@ struct SerialLeftEigenvectorFromSchurInternal { const value_type *S_A12_a = S_part3x3.A12; const value_type *S_A12_b = S_part3x3.A12 + ss0; for (int j = 0; j < (m - m_stl_plus_mA11); ++j) - bc[j + m_stl_plus_mA11] = complex_type(-S_A12_a[j * ss1] * beta, - S_A12_b[j * ss1] * alpha12); + bc[j + m_stl_plus_mA11] = complex_type(-S_A12_a[j * ss1] * beta, S_A12_b[j * ss1] * alpha12); /// perform shifted trsv - SerialShiftedTrsvInternalLower::invoke( - m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, - bc + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); + SerialShiftedTrsvInternalLower::invoke(m - m_stl_plus_mA11, lambda, S_part3x3.A22, ss1, ss0, + bc + m_stl_plus_mA11, 1, blks + m_stl_plus_mA11); /// copy back to V value_type *V_A1_r = V_part3x1.A1; diff --git a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp index 42adf8eeba..af6832940b 100644 --- a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp @@ -28,8 +28,7 @@ namespace KokkosBatched { struct SerialNormalizeInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ ValueType *KOKKOS_RESTRICT v, - const int vs) { + /* */ ValueType *KOKKOS_RESTRICT v, const int vs) { typedef ValueType value_type; typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; @@ -53,10 +52,8 @@ struct SerialNormalizeInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ RealType *KOKKOS_RESTRICT vr, - const int vrs, - /* */ RealType *KOKKOS_RESTRICT vi, - const int vis) { + /* */ RealType *KOKKOS_RESTRICT vr, const int vrs, + /* */ RealType *KOKKOS_RESTRICT vi, const int vis) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp index b0ea39fa3f..b96c47e642 100644 --- a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp @@ -24,17 +24,13 @@ namespace KokkosBatched { template -KOKKOS_INLINE_FUNCTION static int checkPttrfInput( - [[maybe_unused]] const DViewType &d, [[maybe_unused]] const EViewType &e) { - static_assert(Kokkos::is_view::value, - "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); +KOKKOS_INLINE_FUNCTION static int checkPttrfInput([[maybe_unused]] const DViewType &d, + [[maybe_unused]] const EViewType &e) { + static_assert(Kokkos::is_view::value, "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); - static_assert(DViewType::rank == 1, - "KokkosBatched::pttrf: DViewType must have rank 1."); - static_assert(EViewType::rank == 1, - "KokkosBatched::pttrf: EViewType must have rank 1."); + static_assert(DViewType::rank == 1, "KokkosBatched::pttrf: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, "KokkosBatched::pttrf: EViewType must have rank 1."); #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) const int nd = d.extent(0); @@ -55,8 +51,7 @@ KOKKOS_INLINE_FUNCTION static int checkPttrfInput( template <> struct SerialPttrf { template - KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, - const EViewType &e) { + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e) { // Quick return if possible if (d.extent(0) == 0) return 0; if (d.extent(0) == 1) return (d(0) < 0 ? 1 : 0); @@ -64,8 +59,8 @@ struct SerialPttrf { auto info = checkPttrfInput(d, e); if (info) return info; - return SerialPttrfInternal::invoke( - d.extent(0), d.data(), d.stride(0), e.data(), e.stride(0)); + return SerialPttrfInternal::invoke(d.extent(0), d.data(), d.stride(0), e.data(), + e.stride(0)); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp index 5b4d3fb182..438ec43320 100644 --- a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp @@ -25,16 +25,12 @@ namespace KokkosBatched { template struct SerialPttrfInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int n, - ValueType *KOKKOS_RESTRICT d, - const int ds0, - ValueType *KOKKOS_RESTRICT e, - const int es0); + KOKKOS_INLINE_FUNCTION static int invoke(const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + ValueType *KOKKOS_RESTRICT e, const int es0); template - KOKKOS_INLINE_FUNCTION static int invoke( - const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, - Kokkos::complex *KOKKOS_RESTRICT e, const int es0); + KOKKOS_INLINE_FUNCTION static int invoke(const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0); }; /// @@ -44,8 +40,7 @@ struct SerialPttrfInternal { template <> template KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( - const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, - ValueType *KOKKOS_RESTRICT e, const int es0) { + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, ValueType *KOKKOS_RESTRICT e, const int es0) { int info = 0; auto update = [&](const int i) { @@ -54,9 +49,7 @@ KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( d[(i + 1) * ds0] -= e[i * es0] * ei_tmp; }; - auto check_positive_definitiveness = [&](const int i) { - return (d[i] <= 0.0) ? (i + 1) : 0; - }; + auto check_positive_definitiveness = [&](const int i) { return (d[i] <= 0.0) ? (i + 1) : 0; }; // Compute the L*D*L' (or U'*D*U) factorization of A. const int i4 = (n - 1) % 4; @@ -127,8 +120,8 @@ KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( template <> template KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( - const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, - Kokkos::complex *KOKKOS_RESTRICT e, const int es0) { + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, Kokkos::complex *KOKKOS_RESTRICT e, + const int es0) { int info = 0; auto update = [&](const int i) { @@ -140,9 +133,7 @@ KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( d[(i + 1) * ds0] = d[(i + 1) * ds0] - f_tmp * eir_tmp - g_tmp * eii_tmp; }; - auto check_positive_definitiveness = [&](const int i) { - return (d[i] <= 0.0) ? (i + 1) : 0; - }; + auto check_positive_definitiveness = [&](const int i) { return (d[i] <= 0.0) ? (i + 1) : 0; }; // Compute the L*D*L' (or U'*D*U) factorization of A. const int i4 = (n - 1) % 4; diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index ac97a3f772..7c717c2eed 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -34,13 +34,10 @@ namespace KokkosBatched { struct SerialQR_FormQ_Internal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int k, - /* */ ValueType* A, const int as0, - const int as1, + /* */ ValueType* A, const int as0, const int as1, /* */ ValueType* t, const int ts, - /* */ ValueType* Q, const int qs0, - const int qs1, - /* */ ValueType* w, - const bool is_Q_zero = false) { + /* */ ValueType* Q, const int qs0, const int qs1, + /* */ ValueType* w, const bool is_Q_zero = false) { typedef ValueType value_type; /// Given a matrix A that includes QR factorization @@ -57,8 +54,7 @@ struct SerialQR_FormQ_Internal { else SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); - return SerialApplyQ_LeftNoTransForwardInternal ::invoke( - m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); + return SerialApplyQ_LeftNoTransForwardInternal ::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index 66b63f23f6..af7f458898 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -33,12 +33,11 @@ namespace KokkosBatched { /// struct TeamVectorQR_FormQ_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, const int k, - /* */ ValueType *A, const int as0, const int as1, - /* */ ValueType *t, const int ts, - /* */ ValueType *Q, const int qs0, const int qs1, - /* */ ValueType *w, const bool is_Q_zero = false) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int k, + /* */ ValueType *A, const int as0, const int as1, + /* */ ValueType *t, const int ts, + /* */ ValueType *Q, const int qs0, const int qs1, + /* */ ValueType *w, const bool is_Q_zero = false) { typedef ValueType value_type; /// Given a matrix A that includes QR factorization @@ -51,14 +50,12 @@ struct TeamVectorQR_FormQ_Internal { // set identity if (is_Q_zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, value_type(1), - Q, qs0 + qs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0 + qs1); else TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); - return TeamVectorApplyQ_LeftForwardInternal ::invoke( - member, m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, w); + return TeamVectorApplyQ_LeftForwardInternal ::invoke(member, m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, w); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp index 5eac699f56..1083e6af2a 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp @@ -29,10 +29,9 @@ namespace KokkosBatched { template <> template -KOKKOS_INLINE_FUNCTION int SerialQR::invoke( - const AViewType &A, const tViewType &t, const wViewType &w) { - return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1(), t.data(), +KOKKOS_INLINE_FUNCTION int SerialQR::invoke(const AViewType &A, const tViewType &t, + const wViewType &w) { + return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(), t.stride_0(), w.data()); } diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 729604f6c3..95ca1c4340 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -34,8 +34,7 @@ struct SerialQR_Internal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, // m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, /* */ ValueType *w) { typedef ValueType value_type; @@ -66,13 +65,11 @@ struct SerialQR_Internal { /// ----------------------------------------------------- // perform householder transformation - SerialLeftHouseholderInternal::invoke(m_A22, A_part3x3.A11, A_part3x3.A21, - as0, tau); + SerialLeftHouseholderInternal::invoke(m_A22, A_part3x3.A11, A_part3x3.A21, as0, tau); // left apply householder to A22 - SerialApplyLeftHouseholderInternal::invoke( - m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, + A_part3x3.A22, as0, as1, w); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp index 78d6e226a8..2497e5adf5 100644 --- a/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp @@ -30,12 +30,9 @@ namespace KokkosBatched { template struct TeamVectorQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const wViewType &w) { - return TeamVectorQR_Internal::invoke(member, A.extent(0), A.extent(1), - A.data(), A.stride_0(), A.stride_1(), + return TeamVectorQR_Internal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(), t.stride_0(), w.data()); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp index 312feba997..e3dde67986 100644 --- a/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp @@ -35,8 +35,7 @@ struct TeamVectorQR_Internal { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, // m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, /* */ ValueType *w) { typedef ValueType value_type; @@ -67,14 +66,12 @@ struct TeamVectorQR_Internal { /// ----------------------------------------------------- // perform householder transformation - TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, - A_part3x3.A21, as0, tau); + TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, A_part3x3.A21, as0, tau); member.team_barrier(); // left apply householder to A22 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, + A_part3x3.A22, as0, as1, w); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp index 4f293f12cf..ed9ccd8cce 100644 --- a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp @@ -29,17 +29,13 @@ namespace KokkosBatched { template struct TeamVectorQR_WithColumnPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const pViewType &p, - const wViewType &w, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const pViewType &p, const wViewType &w, /* */ int &matrix_rank) { - return TeamVectorQR_WithColumnPivotingInternal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), - t.data(), t.stride_0(), p.data(), p.stride_0(), w.data(), matrix_rank); + return TeamVectorQR_WithColumnPivotingInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), + A.stride_1(), t.data(), t.stride_0(), p.data(), p.stride_0(), + w.data(), matrix_rank); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 26efb70c77..280bfa434b 100644 --- a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -37,10 +37,9 @@ namespace KokkosBatched { /// struct TeamVectorUpdateColumnNormsInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int n, const ValueType *KOKKOS_RESTRICT a, - const int as0, - /* */ ValueType *KOKKOS_RESTRICT norm, const int ns0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int n, const ValueType *KOKKOS_RESTRICT a, + const int as0, + /* */ ValueType *KOKKOS_RESTRICT norm, const int ns0) { using ats = Kokkos::ArithTraits; Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { const int idx_a = j * as0, idx_n = j * ns0; @@ -55,8 +54,7 @@ struct TeamVectorQR_WithColumnPivotingInternal { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, // m = NumRows(A) const int n, // n = NumCols(A) - /* */ ValueType *A, const int as0, - const int as1, + /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts0, /* */ IntType *p, const int ps0, /* */ ValueType *w, @@ -98,8 +96,7 @@ struct TeamVectorQR_WithColumnPivotingInternal { norm_part1x2.partWithAL(norm, n, 0); // compute initial column norms (replaced by dot product) - TeamVectorDotInternal::invoke(member, m, n, A, as0, as1, A, as0, as1, norm, - 1); + TeamVectorDotInternal::invoke(member, m, n, A, as0, as1, A, as0, as1, norm, 1); member.team_barrier(); const bool finish_when_rank_found = (matrix_rank == -1); @@ -124,33 +121,27 @@ struct TeamVectorQR_WithColumnPivotingInternal { /// ----------------------------------------------------- // find max location - TeamVectorFindAmaxInternal::invoke(member, n_AR, norm_part1x2.AR, 1, - pividx); + TeamVectorFindAmaxInternal::invoke(member, n_AR, norm_part1x2.AR, 1, pividx); member.team_barrier(); // apply pivot - TeamVectorApplyPivotVectorForwardInternal::invoke(member, *pividx, - norm_part1x2.AR, 1); - TeamVectorApplyPivotMatrixForwardInternal::invoke( - member, m, *pividx, A_part2x2.ATR, as1, as0); + TeamVectorApplyPivotVectorForwardInternal::invoke(member, *pividx, norm_part1x2.AR, 1); + TeamVectorApplyPivotMatrixForwardInternal::invoke(member, m, *pividx, A_part2x2.ATR, as1, as0); member.team_barrier(); // perform householder transformation - TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, - A_part3x3.A21, as0, tau); + TeamVectorLeftHouseholderInternal::invoke(member, m_A22, A_part3x3.A11, A_part3x3.A21, as0, tau); member.team_barrier(); // left apply householder to A22 - TeamVectorApplyLeftHouseholderInternal::invoke( - member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + TeamVectorApplyLeftHouseholderInternal::invoke(member, m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, + A_part3x3.A22, as0, as1, w); member.team_barrier(); // break condition if (matrix_rank == min_mn) { if (m_atl == 0) max_diag = ats::abs(A[0]); - const value_type val_diag = ats::abs(A_part3x3.A11[0]), - threshold(10 * max_diag * ats::epsilon()); + const value_type val_diag = ats::abs(A_part3x3.A11[0]), threshold(10 * max_diag * ats::epsilon()); if (val_diag < threshold) { matrix_rank = m_atl; if (finish_when_rank_found) break; @@ -158,8 +149,7 @@ struct TeamVectorQR_WithColumnPivotingInternal { } // norm update - TeamVectorUpdateColumnNormsInternal::invoke(member, n_A22, A_part3x3.A12, - as1, norm_part1x3.A2, 1); + TeamVectorUpdateColumnNormsInternal::invoke(member, n_A22, A_part3x3.A12, as1, norm_part1x3.A2, 1); member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); diff --git a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp index 4716506064..029875f810 100644 --- a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp @@ -45,12 +45,9 @@ struct SerialRightEigenvectorFromSchurInternal { /// contiguous workspace that can hold complex array (m) template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ ValueType *S, const int ss0, - const int ss1, - /* */ ValueType *V, const int vs0, - const int vs1, - /* */ ValueType *w, - const int *blks) { + /* */ ValueType *S, const int ss0, const int ss1, + /* */ ValueType *V, const int vs0, const int vs1, + /* */ ValueType *w, const int *blks) { typedef ValueType value_type; typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; @@ -78,8 +75,7 @@ struct SerialRightEigenvectorFromSchurInternal { for (; m_stl > 0;) { /// part 2x2 into 3x3 const int mA11 = blks[m_stl - 1]; - assert(((mA11 == 1) || (mA11 == 2)) && - "RightEigenvectorFromSchur: blk is not 1x1 nor 2x2"); + assert(((mA11 == 1) || (mA11 == 2)) && "RightEigenvectorFromSchur: blk is not 1x1 nor 2x2"); S_part3x3.partWithATL(S_part2x2, mA11, mA11); V_part1x3.partWithAL(V_part1x2, mA11); @@ -90,23 +86,19 @@ struct SerialRightEigenvectorFromSchurInternal { const value_type lambda = *S_part3x3.A11; /// initialize a right eigen vector - for (int i = 0; i < m_stl_minus_mA11; ++i) - b[i] = -S_part3x3.A01[i * ss0]; + for (int i = 0; i < m_stl_minus_mA11; ++i) b[i] = -S_part3x3.A01[i * ss0]; b[m_stl - 1] = one; /// perform shifted trsv - SerialShiftedTrsvInternalUpper::invoke( - m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, w, 1, blks); + SerialShiftedTrsvInternalUpper::invoke(m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, w, 1, blks); /// copy back to V for (int i = 0; i < m_stl; ++i) V_part1x3.A1[i * vs0] = w[i]; for (int i = m_stl; i < m; ++i) V_part1x3.A1[i * vs0] = zero; } else { /// complex eigen pair - const value_type alpha11 = S_part3x3.A11[0], - alpha12 = S_part3x3.A11[ss1], - alpha21 = S_part3x3.A11[ss0], - beta = ats::sqrt(-alpha12 * alpha21); + const value_type alpha11 = S_part3x3.A11[0], alpha12 = S_part3x3.A11[ss1], alpha21 = S_part3x3.A11[ss0], + beta = ats::sqrt(-alpha12 * alpha21); const complex_type lambda(alpha11, beta); complex_type *bc = (complex_type *)(b); @@ -115,14 +107,12 @@ struct SerialRightEigenvectorFromSchurInternal { const value_type *S_A01_a = S_part3x3.A01; const value_type *S_A01_b = S_part3x3.A01 + ss1; for (int i = 0; i < m_stl_minus_mA11; ++i) - bc[i] = complex_type(-S_A01_a[i * ss0] * beta, - S_A01_b[i * ss0] * alpha21); + bc[i] = complex_type(-S_A01_a[i * ss0] * beta, S_A01_b[i * ss0] * alpha21); bc[m_stl - 2] = complex_type(beta, zero); bc[m_stl - 1] = complex_type(zero, -alpha21); /// perform shifted trsv - SerialShiftedTrsvInternalUpper::invoke( - m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, bc, 1, blks); + SerialShiftedTrsvInternalUpper::invoke(m_stl_minus_mA11, lambda, S_part3x3.A00, ss0, ss1, bc, 1, blks); /// copy back to V value_type *V_A1_r = V_part1x3.A1; diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp index a2c345f4fb..e0c25c2ce7 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp @@ -22,50 +22,36 @@ namespace KokkosBatched { // Version which computes the full factorization -template -KOKKOS_INLINE_FUNCTION int SerialSVD::invoke( - SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &sigma, - const VViewType &Vt, const WViewType &work, - typename AViewType::const_value_type tol) { - static_assert(Kokkos::is_view_v && AViewType::rank == 2, - "SVD: A must be a rank-2 view"); - static_assert(Kokkos::is_view_v && UViewType::rank == 2, - "SVD: U must be a rank-2 view"); - static_assert(Kokkos::is_view_v && SViewType::rank == 1, - "SVD: s must be a rank-1 view"); - static_assert(Kokkos::is_view_v && VViewType::rank == 2, - "SVD: V must be a rank-2 view"); - static_assert(Kokkos::is_view_v && WViewType::rank == 1, - "SVD: W must be a rank-1 view"); - static_assert( - !std::is_same_v, - "SVD: W must be contiguous (not LayoutStride)"); +template +KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_USV_Tag, const AViewType &A, const UViewType &U, + const SViewType &sigma, const VViewType &Vt, const WViewType &work, + typename AViewType::const_value_type tol) { + static_assert(Kokkos::is_view_v && AViewType::rank == 2, "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && UViewType::rank == 2, "SVD: U must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && VViewType::rank == 2, "SVD: V must be a rank-2 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, "SVD: W must be a rank-1 view"); + static_assert(!std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using value_type = typename AViewType::non_const_value_type; return KokkosBatched::SerialSVDInternal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), U.data(), - U.stride(0), U.stride(1), Vt.data(), Vt.stride(0), Vt.stride(1), - sigma.data(), sigma.stride(0), work.data(), tol); + A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), U.data(), U.stride(0), U.stride(1), Vt.data(), + Vt.stride(0), Vt.stride(1), sigma.data(), sigma.stride(0), work.data(), tol); } // Version which computes only singular values template -KOKKOS_INLINE_FUNCTION int SerialSVD::invoke( - SVD_S_Tag, const AViewType &A, const SViewType &sigma, - const WViewType &work, typename AViewType::const_value_type tol) { - static_assert(Kokkos::is_view_v && AViewType::rank == 2, - "SVD: A must be a rank-2 view"); - static_assert(Kokkos::is_view_v && SViewType::rank == 1, - "SVD: s must be a rank-1 view"); - static_assert(Kokkos::is_view_v && WViewType::rank == 1, - "SVD: W must be a rank-1 view"); - static_assert( - !std::is_same_v, - "SVD: W must be contiguous (not LayoutStride)"); +KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_S_Tag, const AViewType &A, const SViewType &sigma, + const WViewType &work, typename AViewType::const_value_type tol) { + static_assert(Kokkos::is_view_v && AViewType::rank == 2, "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, "SVD: W must be a rank-1 view"); + static_assert(!std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using value_type = typename AViewType::non_const_value_type; - return KokkosBatched::SerialSVDInternal::invoke( - A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), nullptr, 0, - 0, nullptr, 0, 0, sigma.data(), sigma.stride(0), work.data(), tol); + return KokkosBatched::SerialSVDInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride(0), + A.stride(1), nullptr, 0, 0, nullptr, 0, 0, sigma.data(), + sigma.stride(0), work.data(), tol); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 87ed65d81e..0b85b1e28e 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -49,8 +49,7 @@ struct SerialSVDInternal { // however this is simpler because it exploits the symmetric structure, and // the realness of the eigenvalues. template - KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, - value_type a22, value_type& e1, + KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, value_type a22, value_type& e1, value_type& e2) { value_type a = Kokkos::ArithTraits::one(); value_type b = -a11 - a22; @@ -67,10 +66,8 @@ struct SerialSVDInternal { // // B22 is nsub * nsub, Usub is m * nsub, and Vtsub is nsub * n template - KOKKOS_INLINE_FUNCTION static void svdStep(value_type* B, value_type* U, - value_type* Vt, int um, int vn, - int n, int Bs0, int Bs1, int Us0, - int Us1, int Vts0, int Vts1) { + KOKKOS_INLINE_FUNCTION static void svdStep(value_type* B, value_type* U, value_type* Vt, int um, int vn, int n, + int Bs0, int Bs1, int Us0, int Us1, int Vts0, int Vts1) { using KAT = Kokkos::ArithTraits; // Compute the eigenvalues of trailing 2x2 value_type dn = SVDIND(B, n - 1, n - 1); @@ -91,34 +88,30 @@ struct SerialSVDInternal { // Use Givens to zero out z in [y; z] Kokkos::pair G; value_type discard; // Don't actually write [alpha; 0] anywhere - KokkosBatched::SerialGivensInternal::invoke(y, z, &G, - &discard); + KokkosBatched::SerialGivensInternal::invoke(y, z, &G, &discard); // apply the Givens transformation to B on the right, to columns k,k+1 // B := BG(k, k+1, theta) int minrow = KOKKOSKERNELS_MACRO_MAX(0, k - 1); int maxrow = KOKKOSKERNELS_MACRO_MIN(n, k + 2); - KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, maxrow - minrow, &SVDIND(B, minrow, k + 1), Bs0, - &SVDIND(B, minrow, k), Bs0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, maxrow - minrow, &SVDIND(B, minrow, k + 1), + Bs0, &SVDIND(B, minrow, k), Bs0); if (Vt) { - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, vn, &SVDIND(Vt, k + 1, 0), Vts1, &SVDIND(Vt, k, 0), Vts1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, vn, &SVDIND(Vt, k + 1, 0), Vts1, + &SVDIND(Vt, k, 0), Vts1); } y = SVDIND(B, k, k); z = SVDIND(B, k + 1, k); - KokkosBatched::SerialGivensInternal::invoke(y, z, &G, - &SVDIND(B, k, k)); + KokkosBatched::SerialGivensInternal::invoke(y, z, &G, &SVDIND(B, k, k)); SVDIND(B, k + 1, k) = KAT::zero(); int mincol = k + 1; int maxcol = KOKKOSKERNELS_MACRO_MIN(n, k + 3); // apply Givens transformation to B on the left, to rows k, k + 1 // B := G(k, k+1, theta)^T * B - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, maxcol - mincol, &SVDIND(B, k + 1, mincol), Bs1, - &SVDIND(B, k, mincol), Bs1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, maxcol - mincol, &SVDIND(B, k + 1, mincol), + Bs1, &SVDIND(B, k, mincol), Bs1); if (U) { - KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, um, &SVDIND(U, 0, k + 1), Us0, &SVDIND(U, 0, k), Us0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, um, &SVDIND(U, 0, k + 1), Us0, + &SVDIND(U, 0, k), Us0); } if (k < n - 2) { y = SVDIND(B, k, k + 1); @@ -131,71 +124,65 @@ struct SerialSVDInternal { // Assumes i is not the last row. // U is m*m, B is n*n template - KOKKOS_INLINE_FUNCTION static void svdZeroRow(int i, value_type* B, int n, - int Bs0, int Bs1, value_type* U, - int m, int Us0, int Us1) { + KOKKOS_INLINE_FUNCTION static void svdZeroRow(int i, value_type* B, int n, int Bs0, int Bs1, value_type* U, int m, + int Us0, int Us1) { Kokkos::pair G; for (int j = i + 1; j < n; j++) { // Zero out B(i, j) against diagonal j, introducing nonzero in B(i, j + 1) - KokkosBatched::SerialGivensInternal::invoke( - SVDIND(B, j, j), SVDIND(B, i, j), &G, &SVDIND(B, j, j)); + KokkosBatched::SerialGivensInternal::invoke(SVDIND(B, j, j), SVDIND(B, i, j), &G, &SVDIND(B, j, j)); SVDIND(B, i, j) = Kokkos::ArithTraits::zero(); // Now, only need to apply givens to a single column (if not already at // the end), introducing the next nonzero if (j < n - 1) { - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, 1, &SVDIND(B, i, j + 1), Bs1, &SVDIND(B, j, j + 1), Bs1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, 1, &SVDIND(B, i, j + 1), Bs1, + &SVDIND(B, j, j + 1), Bs1); } if (U) { - KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, m, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, j), Us0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, m, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, j), + Us0); } } } template - KOKKOS_INLINE_FUNCTION static void svdZeroLastColumn(value_type* B, int n, - int Bs0, int Bs1, - value_type* Vt, int Vts0, + KOKKOS_INLINE_FUNCTION static void svdZeroLastColumn(value_type* B, int n, int Bs0, int Bs1, value_type* Vt, int Vts0, int Vts1) { // Deal with B(n-1, n-1) = 0, by chasing the superdiagonal nonzero up the // last column. Kokkos::pair G; for (int j = n - 2; j >= 0; j--) { - KokkosBatched::SerialGivensInternal::invoke( - SVDIND(B, j, j), SVDIND(B, j, n - 1), &G, &SVDIND(B, j, j)); + KokkosBatched::SerialGivensInternal::invoke(SVDIND(B, j, j), SVDIND(B, j, n - 1), &G, + &SVDIND(B, j, j)); SVDIND(B, j, n - 1) = Kokkos::ArithTraits::zero(); if (j != 0) { - KokkosBatched::SerialApplyRightGivensInternal::invoke( - G, 1, &SVDIND(B, j - 1, n - 1), Bs0, &SVDIND(B, j - 1, j), Bs0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, 1, &SVDIND(B, j - 1, n - 1), Bs0, + &SVDIND(B, j - 1, j), Bs0); } if (Vt) { - KokkosBatched::SerialApplyLeftGivensInternal::invoke( - G, n, &SVDIND(Vt, n - 1, 0), Vts1, &SVDIND(Vt, j, 0), Vts1); + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, n, &SVDIND(Vt, n - 1, 0), Vts1, + &SVDIND(Vt, j, 0), Vts1); } } } template - KOKKOS_INLINE_FUNCTION static void bidiagonalize( - int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, - int Us1, value_type* Vt, int Vts0, int Vts1, value_type* work) { + KOKKOS_INLINE_FUNCTION static void bidiagonalize(int m, int n, value_type* A, int As0, int As1, value_type* U, + int Us0, int Us1, value_type* Vt, int Vts0, int Vts1, + value_type* work) { using KAT = Kokkos::ArithTraits; value_type tau; for (int i = 0; i < n; i++) { // Eliminating column i of A below the diagonal - KokkosBatched::SerialLeftHouseholderInternal::invoke( - m - i - 1, &SVDIND(A, i, i), &SVDIND(A, i + 1, i), As0, &tau); + KokkosBatched::SerialLeftHouseholderInternal::invoke(m - i - 1, &SVDIND(A, i, i), + &SVDIND(A, i + 1, i), As0, &tau); if (n - i > 1) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( - m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, - &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), As0, As1, - work); + m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), + As0, As1, work); } if (U) { KokkosBatched::SerialApplyRightHouseholderInternal::invoke( - m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), - Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work); + m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work); } // Zero out A subdiag explicitly (NOTE: may not be necessary...) for (int j = i + 1; j < m; j++) { @@ -203,19 +190,17 @@ struct SerialSVDInternal { } if (i < n - 2) { // Eliminating row i of A to the right of the 1st superdiagonal - KokkosBatched::SerialLeftHouseholderInternal::invoke( - n - i - 2, &SVDIND(A, i, i + 1), &SVDIND(A, i, i + 2), As1, &tau); + KokkosBatched::SerialLeftHouseholderInternal::invoke(n - i - 2, &SVDIND(A, i, i + 1), + &SVDIND(A, i, i + 2), As1, &tau); if (m - i > 1) { - KokkosBatched::SerialApplyRightHouseholderInternal::invoke< - value_type>(m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, - &SVDIND(A, i + 1, i + 1), As0, - &SVDIND(A, i + 1, i + 2), As0, As1, work); + KokkosBatched::SerialApplyRightHouseholderInternal::invoke( + m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0, + &SVDIND(A, i + 1, i + 2), As0, As1, work); } if (Vt) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( - n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, - &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, Vts1, - work); + n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, + Vts1, work); } // Zero out A superdiag row explicitly for (int j = i + 2; j < n; j++) { @@ -229,11 +214,8 @@ struct SerialSVDInternal { // U and Vt to maintain the product U*B*Vt. At the end, the singular values // are copied to sigma. template - KOKKOS_INLINE_FUNCTION static void bidiSVD(int m, int n, value_type* B, - int Bs0, int Bs1, value_type* U, - int Us0, int Us1, value_type* Vt, - int Vts0, int Vts1, - value_type* sigma, int ss, + KOKKOS_INLINE_FUNCTION static void bidiSVD(int m, int n, value_type* B, int Bs0, int Bs1, value_type* U, int Us0, + int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss, const value_type& tol) { using KAT = Kokkos::ArithTraits; const value_type eps = Kokkos::ArithTraits::epsilon(); @@ -242,8 +224,7 @@ struct SerialSVDInternal { while (true) { // Zero out tiny superdiagonal entries for (int i = 0; i < n - 1; i++) { - if (fabs(SVDIND(B, i, i + 1)) < - eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1))) || + if (fabs(SVDIND(B, i, i + 1)) < eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1))) || fabs(SVDIND(B, i, i + 1)) < tol) { SVDIND(B, i, i + 1) = KAT::zero(); } @@ -283,8 +264,7 @@ struct SerialSVDInternal { } int nsub = q - p; // B22 is nsub * nsub, Usub is m * nsub, and Vtsub is nsub * n - svdStep(&SVDIND(B, p, p), &SVDIND(U, 0, p), &SVDIND(Vt, p, 0), m, n, nsub, - Bs0, Bs1, Us0, Us1, Vts0, Vts1); + svdStep(&SVDIND(B, p, p), &SVDIND(U, 0, p), &SVDIND(Vt, p, 0), m, n, nsub, Bs0, Bs1, Us0, Us1, Vts0, Vts1); } for (int i = 0; i < n; i++) { sigma[i * ss] = SVDIND(B, i, i); @@ -294,11 +274,8 @@ struct SerialSVDInternal { // Convert SVD into conventional form: singular values positive and in // descending order template - KOKKOS_INLINE_FUNCTION static void postprocessSVD(int m, int n, value_type* U, - int Us0, int Us1, - value_type* Vt, int Vts0, - int Vts1, value_type* sigma, - int ss) { + KOKKOS_INLINE_FUNCTION static void postprocessSVD(int m, int n, value_type* U, int Us0, int Us1, value_type* Vt, + int Vts0, int Vts1, value_type* sigma, int ss) { // First step: flip signs on negative singular values for (int i = 0; i < n; i++) { if (sigma[i * ss] < 0) { @@ -327,23 +304,19 @@ struct SerialSVDInternal { if (i != maxloc) { SVDSWAP(sigma[i * ss], sigma[maxloc * ss]); if (U) { - for (int j = 0; j < m; j++) - SVDSWAP(SVDIND(U, j, i), SVDIND(U, j, maxloc)) + for (int j = 0; j < m; j++) SVDSWAP(SVDIND(U, j, i), SVDIND(U, j, maxloc)) } if (Vt) { - for (int j = 0; j < n; j++) - SVDSWAP(SVDIND(Vt, i, j), SVDIND(Vt, maxloc, j)) + for (int j = 0; j < n; j++) SVDSWAP(SVDIND(Vt, i, j), SVDIND(Vt, maxloc, j)) } } } } template - KOKKOS_INLINE_FUNCTION static int invoke( - int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, - int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss, - value_type* work, - value_type tol = Kokkos::ArithTraits::zero()) { + KOKKOS_INLINE_FUNCTION static int invoke(int m, int n, value_type* A, int As0, int As1, value_type* U, int Us0, + int Us1, value_type* Vt, int Vts0, int Vts1, value_type* sigma, int ss, + value_type* work, value_type tol = Kokkos::ArithTraits::zero()) { // First, if m < n, need to instead compute (V, s, U^T) = A^T. // This just means swapping U & Vt, and implicitly transposing A, U and Vt. if (m < n) { @@ -356,12 +329,10 @@ struct SerialSVDInternal { SVDSWAP(Us1, Vts0); } if (U) { - KokkosBatched::SerialSetIdentityInternal::invoke(m, m, U, Us0, - Us1); + KokkosBatched::SerialSetIdentityInternal::invoke(m, m, U, Us0, Us1); } if (Vt) { - KokkosBatched::SerialSetIdentityInternal::invoke(n, n, Vt, - Vts0, Vts1); + KokkosBatched::SerialSetIdentityInternal::invoke(n, n, Vt, Vts0, Vts1); } if (m == 0 || n == 0) { // sigma is length 0, so there's nothing left to compute diff --git a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp index 22a599ed58..41e525d2ba 100644 --- a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp @@ -30,12 +30,9 @@ namespace KokkosBatched { /// struct SerialSchur2x2Internal { template - KOKKOS_INLINE_FUNCTION static int invoke(RealType* alpha00, RealType* alpha01, - RealType* alpha10, RealType* alpha11, - Kokkos::pair* G, - Kokkos::complex* lambda1, - Kokkos::complex* lambda2, - bool* is_complex) { + KOKKOS_INLINE_FUNCTION static int invoke(RealType* alpha00, RealType* alpha01, RealType* alpha10, RealType* alpha11, + Kokkos::pair* G, Kokkos::complex* lambda1, + Kokkos::complex* lambda2, bool* is_complex) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; const real_type zero(0), one(1), half(0.5), minus_one(-1); @@ -70,8 +67,7 @@ struct SerialSchur2x2Internal { *lambda1 = Kokkos::complex(*alpha00, zero); *lambda2 = Kokkos::complex(*alpha11, zero); *is_complex = false; - } else if (ats::abs(*alpha00 - *alpha11) < tol && - (*alpha01) * (*alpha10) > zero) { + } else if (ats::abs(*alpha00 - *alpha11) < tol && (*alpha01) * (*alpha10) > zero) { // no rotation (already the standard schur form) *G = Kokkos::pair(one, zero); /// two real eigen values @@ -84,9 +80,8 @@ struct SerialSchur2x2Internal { const real_type b = (*alpha01) + (*alpha10); const real_type l = ats::sqrt(a * a + b * b); const real_type c = ats::sqrt(half * (one + ats::abs(b) / l)); - const real_type s = - -((half * a) / (l * c)) * (b > zero ? one : minus_one); - *G = Kokkos::pair(c, s); + const real_type s = -((half * a) / (l * c)) * (b > zero ? one : minus_one); + *G = Kokkos::pair(c, s); /// [ gamma sigma ][ alpha00 alpha01 [ gamma -sigma --> [ alpha11 /// -alpha10 /// -sigma gamma ] alpha10 alpha11 ] sigma gamma ] 0 alpha00] @@ -105,19 +100,17 @@ struct SerialSchur2x2Internal { const real_type mult_alpha_offdiags = (*alpha10) * (*alpha01); if (mult_alpha_offdiags > zero) { /// transforms the matrix into a upper triangular - const real_type sqrt_mult_alpha_offdiags = - ats::sqrt(mult_alpha_offdiags); + const real_type sqrt_mult_alpha_offdiags = ats::sqrt(mult_alpha_offdiags); /// redefine the rotation matrix // const real_type sqrt_abs_alpha01 = ats::sqrt(ats::abs(*alpha01)); // const real_type sqrt_abs_alpha10 = ats::sqrt(ats::abs(*alpha10)); const real_type abs_sum_offidags = ats::abs((*alpha01) + (*alpha10)); - const real_type c1 = ats::sqrt(ats::abs(*alpha01) / abs_sum_offidags); - const real_type s1 = ats::sqrt(ats::abs(*alpha10) / abs_sum_offidags); - const real_type sign_alpha10 = *alpha10 > zero ? one : minus_one; + const real_type c1 = ats::sqrt(ats::abs(*alpha01) / abs_sum_offidags); + const real_type s1 = ats::sqrt(ats::abs(*alpha10) / abs_sum_offidags); + const real_type sign_alpha10 = *alpha10 > zero ? one : minus_one; - *G = Kokkos::pair(c * c1 - s * s1, - c * s1 + s * c1); + *G = Kokkos::pair(c * c1 - s * s1, c * s1 + s * c1); /// apply rotation to 2x2 matrix so that alpha10 becomes zero *alpha00 = tmp + sign_alpha10 * sqrt_mult_alpha_offdiags; @@ -131,12 +124,10 @@ struct SerialSchur2x2Internal { *is_complex = false; } else { /// two complex eigen values - const real_type sqrt_mult_alpha_offdiags = - ats::sqrt(-mult_alpha_offdiags); - *lambda1 = Kokkos::complex(tmp, sqrt_mult_alpha_offdiags); - *lambda2 = - Kokkos::complex(lambda1->real(), -lambda1->imag()); - *is_complex = true; + const real_type sqrt_mult_alpha_offdiags = ats::sqrt(-mult_alpha_offdiags); + *lambda1 = Kokkos::complex(tmp, sqrt_mult_alpha_offdiags); + *lambda2 = Kokkos::complex(lambda1->real(), -lambda1->imag()); + *is_complex = true; } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp index c7f35d5c4f..c6d55b301b 100644 --- a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp @@ -68,33 +68,27 @@ struct SerialSchurInternal { /// returns -1. template KOKKOS_INLINE_FUNCTION static int invoke(const int m, - /* */ RealType *H, const int hs0, - const int hs1, - /* */ RealType *Z, const int zs0, - const int zs1, - /* */ RealType *w, const int wlen, - const bool restart = false, + /* */ RealType *H, const int hs0, const int hs1, + /* */ RealType *Z, const int zs0, const int zs1, + /* */ RealType *w, const int wlen, const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; typedef Kokkos::ArithTraits ats; const real_type /* one(1), */ zero(0), tol = 1e2 * ats::epsilon(); const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration; - if (wlen < m * 5) - Kokkos::abort("Error: provided workspace is smaller than 3*m"); + if (wlen < m * 5) Kokkos::abort("Error: provided workspace is smaller than 3*m"); int r_val = 0; if (restart) { - if (m <= 2) - Kokkos::abort("Error: restart option cannot be used for m=1 or m=2"); + if (m <= 2) Kokkos::abort("Error: restart option cannot be used for m=1 or m=2"); } else { /// do not touch input /// SerialSetIdentityInternal::invoke(m, Z, zs0, zs1); } // workspaces - real_type *subdiags = w; - Kokkos::pair *Gs = - (Kokkos::pair *)(w + m); + real_type *subdiags = w; + Kokkos::pair *Gs = (Kokkos::pair *)(w + m); if (!restart) { /// initialize workspace and Gs for (int i = 0; i < m; ++i) subdiags[i] = zero; @@ -111,8 +105,7 @@ struct SerialSchurInternal { bool is_complex; Kokkos::complex lambda1, lambda2; Kokkos::pair G; - SerialSchur2x2Internal::invoke(H, H + hs1, H + hs0, H + hs, &G, - &lambda1, &lambda2, &is_complex); + SerialSchur2x2Internal::invoke(H, H + hs1, H + hs0, H + hs, &G, &lambda1, &lambda2, &is_complex); G.second = -G.second; // transpose SerialApplyRightGivensInternal::invoke(G, 2, Z, zs0, Z + zs1, zs0); @@ -171,49 +164,37 @@ struct SerialSchurInternal { real_type *sub2x2 = H + (mend - 2) * hs; if (2 == mdiff) { Kokkos::pair G; - SerialSchur2x2Internal::invoke(sub2x2, sub2x2 + hs1, - sub2x2 + hs0, sub2x2 + hs, &G, - &lambda1, &lambda2, &is_complex); + SerialSchur2x2Internal::invoke(sub2x2, sub2x2 + hs1, sub2x2 + hs0, sub2x2 + hs, &G, &lambda1, &lambda2, + &is_complex); subdiags[mend - 1] = sub2x2[hs0]; /// apply G' from left G.second = -G.second; - SerialApplyLeftGivensInternal::invoke( - G, m - mend, sub2x2 + 2 * hs1, hs1, sub2x2 + hs0 + 2 * hs1, - hs1); + SerialApplyLeftGivensInternal::invoke(G, m - mend, sub2x2 + 2 * hs1, hs1, sub2x2 + hs0 + 2 * hs1, hs1); /// apply (G')' from right - SerialApplyRightGivensInternal::invoke( - G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, - sub2x2 + hs1 - mend_minus_two_mult_hs0, hs0); + SerialApplyRightGivensInternal::invoke(G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, + sub2x2 + hs1 - mend_minus_two_mult_hs0, hs0); sub2x2[hs0] = zero; /// apply (G')' from right to compute Z - SerialApplyRightGivensInternal::invoke( - G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, zs0); + SerialApplyRightGivensInternal::invoke(G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, zs0); } else { - SerialWilkinsonShiftInternal::invoke( - sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, - &lambda2, &is_complex); + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], sub2x2[hs0], sub2x2[hs], &lambda1, + &lambda2, &is_complex); - SerialFrancisInternal::invoke(mbeg, mend, m, H, hs0, hs1, - lambda1, lambda2, is_complex, Gs, - true); + SerialFrancisInternal::invoke(mbeg, mend, m, H, hs0, hs1, lambda1, lambda2, is_complex, Gs, true); /* */ auto &val1 = *(sub2x2 + hs0); /* */ auto &val2 = *(sub2x2 - hs1); const auto abs_val1 = ats::abs(val1); const auto abs_val2 = ats::abs(val2); for (int i = mbeg; i < (mend - 1); ++i) { - const Kokkos::pair G0( - Gs[2 * i].first, -Gs[2 * i].second); - const Kokkos::pair G1( - Gs[2 * i + 1].first, -Gs[2 * i + 1].second); - SerialApplyRightGivensInternal::invoke( - G0, m, Z + i * zs1, zs0, Z + i * zs1 + 1 * zs1, zs0); - SerialApplyRightGivensInternal::invoke( - G1, m, Z + i * zs1, zs0, Z + i * zs1 + 2 * zs1, zs0); + const Kokkos::pair G0(Gs[2 * i].first, -Gs[2 * i].second); + const Kokkos::pair G1(Gs[2 * i + 1].first, -Gs[2 * i + 1].second); + SerialApplyRightGivensInternal::invoke(G0, m, Z + i * zs1, zs0, Z + i * zs1 + 1 * zs1, zs0); + SerialApplyRightGivensInternal::invoke(G1, m, Z + i * zs1, zs0, Z + i * zs1 + 2 * zs1, zs0); } /// convergence check @@ -222,28 +203,23 @@ struct SerialSchurInternal { } else if (abs_val2 < tol) { /// preserve the standard schur form Kokkos::pair G; - SerialSchur2x2Internal::invoke( - sub2x2, sub2x2 + hs1, sub2x2 + hs0, sub2x2 + hs, &G, - &lambda1, &lambda2, &is_complex); + SerialSchur2x2Internal::invoke(sub2x2, sub2x2 + hs1, sub2x2 + hs0, sub2x2 + hs, &G, &lambda1, + &lambda2, &is_complex); subdiags[mend - 1] = val1; /// apply G' from left G.second = -G.second; - SerialApplyLeftGivensInternal::invoke( - G, m - mend, sub2x2 + 2 * hs1, hs1, - sub2x2 + hs0 + 2 * hs1, hs1); + SerialApplyLeftGivensInternal::invoke(G, m - mend, sub2x2 + 2 * hs1, hs1, sub2x2 + hs0 + 2 * hs1, + hs1); // apply (G')' from right - SerialApplyRightGivensInternal::invoke( - G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, - sub2x2 + hs1 - mend_minus_two_mult_hs0, hs0); + SerialApplyRightGivensInternal::invoke(G, mend - 2, sub2x2 - mend_minus_two_mult_hs0, hs0, + sub2x2 + hs1 - mend_minus_two_mult_hs0, hs0); val1 = zero; val2 = zero; // apply (G')' from right - SerialApplyRightGivensInternal::invoke( - G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, - zs0); + SerialApplyRightGivensInternal::invoke(G, m, Z + (mend - 2) * zs1, zs0, Z + (mend - 1) * zs1, zs0); } } } diff --git a/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp b/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp index e826c4cbb7..9219f3a9ec 100644 --- a/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp @@ -29,8 +29,7 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION int SerialSetIdentity::invoke(const AViewType &A) { - return SerialSetIdentityInternal::invoke(A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1()); + return SerialSetIdentityInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } /// @@ -39,10 +38,8 @@ KOKKOS_INLINE_FUNCTION int SerialSetIdentity::invoke(const AViewType &A) { template template -KOKKOS_INLINE_FUNCTION int TeamSetIdentity::invoke( - const MemberType &member, const AViewType &A) { - return TeamSetIdentityInternal::invoke(member, A.extent(0), A.extent(1), - A.data(), A.stride_0(), A.stride_1()); +KOKKOS_INLINE_FUNCTION int TeamSetIdentity::invoke(const MemberType &member, const AViewType &A) { + return TeamSetIdentityInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } } // end namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp b/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp index 7a89767526..f5afb5c79c 100644 --- a/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp @@ -28,8 +28,7 @@ namespace KokkosBatched { struct SerialSetIdentityInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { const ValueType one(1), zero(0); for (int j = 0; j < n; ++j) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -49,10 +48,8 @@ struct SerialSetIdentityInternal { /// ================== struct TeamSetIdentityInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { const ValueType one(1), zero(0); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -70,15 +67,12 @@ struct TeamSetIdentityInternal { /// ======================== struct TeamVectorSetIdentityInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { const ValueType one(1), zero(0); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] = i == j ? one : zero; }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A[i * as0 + j * as1] = i == j ? one : zero; }); }); return 0; diff --git a/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp b/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp index 844c3f72c5..09e94ab5f3 100644 --- a/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp @@ -27,11 +27,8 @@ namespace KokkosBatched { /// ==================== struct SerialSetLowerTriangularInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const int dist, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const int dist, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { for (int j = 0; j < n; ++j) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -47,18 +44,14 @@ struct SerialSetLowerTriangularInternal { struct TeamVectorSetLowerTriangularInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const int dist, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const int dist, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { const int jdist = j + dist; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), - [=](const int &i) { - if (i >= jdist) A[i * as0 + j * as1] = alpha; - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [=](const int &i) { + if (i >= jdist) A[i * as0 + j * as1] = alpha; + }); }); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp index 2e356f818e..c0963447c4 100644 --- a/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp @@ -36,19 +36,16 @@ namespace KokkosBatched { struct SerialShiftedTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, - const ValueTypeA *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, const ValueTypeA *KOKKOS_RESTRICT A, const int as0, const int as1, - /* */ ValueTypeB *KOKKOS_RESTRICT b, - const int bs0, + /* */ ValueTypeB *KOKKOS_RESTRICT b, const int bs0, const int *KOKKOS_RESTRICT blks) { const int as = as0 + as1; int p = 0; for (; p < m;) { const int blk = blks[p], iend = m - p - blk; - assert(((blk == 1) || (blk == 2)) && - "ShiftedTrsvLower: blocks are not 1x1 or 2x2"); + assert(((blk == 1) || (blk == 2)) && "ShiftedTrsvLower: blocks are not 1x1 or 2x2"); if (blk == 1) { const auto alpha11 = A[p * as] - lambda; ValueTypeB *KOKKOS_RESTRICT beta1 = b + p * bs0; @@ -84,9 +81,7 @@ struct SerialShiftedTrsvInternalLower { const ValueTypeA *KOKKOS_RESTRICT A21 = A + p * as + 2 * as0; ValueTypeB *KOKKOS_RESTRICT b2 = beta1 + 2 * bs0; - for (int i = 0; i < iend; ++i) - b2[i * bs0] -= - (A21[i * as0] * (*beta1) + A21[i * as0 + as1] * (*beta2)); + for (int i = 0; i < iend; ++i) b2[i * bs0] -= (A21[i * as0] * (*beta1) + A21[i * as0 + as1] * (*beta2)); } } p += blk; @@ -101,11 +96,9 @@ struct SerialShiftedTrsvInternalLower { struct SerialShiftedTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, - const ValueTypeA *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType lambda, const ValueTypeA *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueTypeB *KOKKOS_RESTRICT b, - const int bs0, + /**/ ValueTypeB *KOKKOS_RESTRICT b, const int bs0, const int *KOKKOS_RESTRICT blks) { const int as = as0 + as1; @@ -114,10 +107,9 @@ struct SerialShiftedTrsvInternalUpper { int p = m - 1; for (; p >= 0;) { const int blk = blks[p], iend = p + 1 - blk; - assert(((blk == 1) || (blk == 2)) && - "ShiftedTrsvUpper: blocks are not 1x1 or 2x2"); + assert(((blk == 1) || (blk == 2)) && "ShiftedTrsvUpper: blocks are not 1x1 or 2x2"); if (blk == 1) { - const auto alpha11 = A[p * as] - lambda; + const auto alpha11 = A[p * as] - lambda; /**/ ValueTypeB *KOKKOS_RESTRICT beta1 = b + p * bs0; // with KOKKOS_RESTRICT a compiler assumes that the pointer is not @@ -148,9 +140,7 @@ struct SerialShiftedTrsvInternalUpper { if (iend) { const ValueTypeA *KOKKOS_RESTRICT A01 = A + p_minus_one * as1; - for (int i = 0; i < iend; ++i) - b0[i * bs0] -= - (A01[i * as0] * (*beta1) + A01[i * as0 + as1] * (*beta2)); + for (int i = 0; i < iend; ++i) b0[i * bs0] -= (A01[i * as0] * (*beta1) + A01[i * as0 + as1] * (*beta2)); } } p -= blk; diff --git a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp index 4f6f81216d..3b85a26294 100644 --- a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp @@ -28,26 +28,21 @@ namespace KokkosBatched { /// =============== template struct TeamVectorSolveUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const UViewType &U, - const TViewType &T, const VViewType &V, const pViewType &p, - const XViewType &X, const BViewType &B, const wViewType &w) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int matrix_rank, const UViewType &U, + const TViewType &T, const VViewType &V, const pViewType &p, + const XViewType &X, const BViewType &B, const wViewType &w) { if (BViewType::rank == 1) - TeamVectorSolveUTV_Internal::invoke( - member, matrix_rank, T.extent(0), V.extent(0), U.data(), U.stride(0), - U.stride(1), T.data(), T.stride(0), T.stride(1), V.data(), - V.stride(0), V.stride(1), p.data(), p.stride(0), X.data(), - X.stride(0), B.data(), B.stride(0), w.data()); + TeamVectorSolveUTV_Internal::invoke(member, matrix_rank, T.extent(0), V.extent(0), U.data(), U.stride(0), + U.stride(1), T.data(), T.stride(0), T.stride(1), V.data(), V.stride(0), + V.stride(1), p.data(), p.stride(0), X.data(), X.stride(0), B.data(), + B.stride(0), w.data()); else - TeamVectorSolveUTV_Internal::invoke( - member, matrix_rank, T.extent(0), V.extent(0), B.extent(1), U.data(), - U.stride(0), U.stride(1), T.data(), T.stride(0), T.stride(1), - V.data(), V.stride(0), V.stride(1), p.data(), p.stride(0), X.data(), - X.stride(0), X.stride(1), B.data(), B.stride(0), B.stride(1), - w.data()); + TeamVectorSolveUTV_Internal::invoke(member, matrix_rank, T.extent(0), V.extent(0), B.extent(1), U.data(), + U.stride(0), U.stride(1), T.data(), T.stride(0), T.stride(1), V.data(), + V.stride(0), V.stride(1), p.data(), p.stride(0), X.data(), X.stride(0), + X.stride(1), B.data(), B.stride(0), B.stride(1), w.data()); return 0; } }; diff --git a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp index 71050504aa..18440745eb 100644 --- a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp @@ -33,14 +33,13 @@ namespace KokkosBatched { /// =================== struct TeamVectorSolveUTV_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const int m, - const int /*n*/, const ValueType *U, const int us0, const int us1, - const ValueType *T, const int ts0, const int ts1, const ValueType *V, - const int vs0, const int vs1, const IntType *p, const int ps0, - /* */ ValueType *x, const int xs0, - /* */ ValueType *b, const int bs0, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int matrix_rank, const int m, + const int /*n*/, const ValueType *U, const int us0, const int us1, + const ValueType *T, const int ts0, const int ts1, const ValueType *V, + const int vs0, const int vs1, const IntType *p, const int ps0, + /* */ ValueType *x, const int xs0, + /* */ ValueType *b, const int bs0, + /* */ ValueType *w) { typedef ValueType value_type; // typedef IntType int_type; @@ -49,40 +48,36 @@ struct TeamVectorSolveUTV_Internal { if (matrix_rank < m) { /// w = U^T b - KokkosBlas::Impl::TeamVectorGemvInternal::invoke( - member, matrix_rank, m, one, U, us1, us0, b, bs0, zero, w, ws0); + KokkosBlas::Impl::TeamVectorGemvInternal::invoke(member, matrix_rank, m, one, U, us1, us0, + b, bs0, zero, w, ws0); /// w = T^{-1} w - TeamVectorTrsvInternalLower::invoke( - member, false, matrix_rank, one, T, ts0, ts1, w, ws0); + TeamVectorTrsvInternalLower::invoke(member, false, matrix_rank, one, T, ts0, ts1, w, ws0); /// x = V^T w - KokkosBlas::Impl::TeamVectorGemvInternal::invoke( - member, m, matrix_rank, one, V, vs1, vs0, w, ws0, zero, x, xs0); + KokkosBlas::Impl::TeamVectorGemvInternal::invoke(member, m, matrix_rank, one, V, vs1, vs0, + w, ws0, zero, x, xs0); } else { - KokkosBlas::Impl::TeamVectorGemvInternal::invoke( - member, matrix_rank, m, one, U, us1, us0, b, bs0, zero, x, xs0); + KokkosBlas::Impl::TeamVectorGemvInternal::invoke(member, matrix_rank, m, one, U, us1, us0, + b, bs0, zero, x, xs0); - TeamVectorTrsvInternalUpper::invoke( - member, false, matrix_rank, one, T, ts0, ts1, x, xs0); + TeamVectorTrsvInternalUpper::invoke(member, false, matrix_rank, one, T, ts0, ts1, x, xs0); } /// x = P^T x - TeamVectorApplyPivotVectorBackwardInternal ::invoke(member, m, p, ps0, x, - xs0); + TeamVectorApplyPivotVectorBackwardInternal ::invoke(member, m, p, ps0, x, xs0); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const int m, const int n, - const int nrhs, const ValueType *U, const int us0, const int us1, - const ValueType *T, const int ts0, const int ts1, const ValueType *V, - const int vs0, const int vs1, const IntType *p, const int ps0, - /* */ ValueType *X, const int xs0, const int xs1, - /* */ ValueType *B, const int bs0, const int bs1, - /* */ ValueType *w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int matrix_rank, const int m, const int n, + const int nrhs, const ValueType *U, const int us0, const int us1, + const ValueType *T, const int ts0, const int ts1, const ValueType *V, + const int vs0, const int vs1, const IntType *p, const int ps0, + /* */ ValueType *X, const int xs0, const int xs1, + /* */ ValueType *B, const int bs0, const int bs1, + /* */ ValueType *w) { typedef ValueType value_type; // typedef IntType int_type; @@ -96,37 +91,33 @@ struct TeamVectorSolveUTV_Internal { /// T is matrix_rank x matrix_rank /// V is matrix_rank x n /// W = U^T B - TeamVectorGemmInternal::invoke( - member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, zero, W, - ws0, ws1); + TeamVectorGemmInternal::invoke(member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, + zero, W, ws0, ws1); member.team_barrier(); /// W = T^{-1} W - TeamVectorTrsmInternalLeftLower::invoke( - member, false, matrix_rank, nrhs, one, T, ts0, ts1, W, ws0, ws1); + TeamVectorTrsmInternalLeftLower::invoke(member, false, matrix_rank, nrhs, one, T, ts0, ts1, + W, ws0, ws1); member.team_barrier(); /// X = V^T W - TeamVectorGemmInternal::invoke( - member, n, nrhs, matrix_rank, one, V, vs1, vs0, W, ws0, ws1, zero, X, - xs0, xs1); + TeamVectorGemmInternal::invoke(member, n, nrhs, matrix_rank, one, V, vs1, vs0, W, ws0, ws1, + zero, X, xs0, xs1); member.team_barrier(); } else { /// W = U^T B - TeamVectorGemmInternal::invoke( - member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, zero, X, - xs0, xs1); + TeamVectorGemmInternal::invoke(member, matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, + zero, X, xs0, xs1); member.team_barrier(); /// X = T^{-1} X - TeamVectorTrsmInternalLeftUpper::invoke( - member, false, matrix_rank, nrhs, one, T, ts0, ts1, X, xs0, xs1); + TeamVectorTrsmInternalLeftUpper::invoke(member, false, matrix_rank, nrhs, one, T, ts0, ts1, + X, xs0, xs1); member.team_barrier(); } /// X = P^T X - TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, nrhs, n, p, ps0, - X, xs0, xs1); + TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, nrhs, n, p, ps0, X, xs0, xs1); return 0; } diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp index 675e73f744..853e453b89 100644 --- a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp @@ -25,17 +25,12 @@ namespace KokkosBatched { template -KOKKOS_INLINE_FUNCTION static int checkTbsvInput( - [[maybe_unused]] const AViewType &A, [[maybe_unused]] const XViewType &x, - [[maybe_unused]] const int k) { - static_assert(Kokkos::is_view::value, - "KokkosBatched::tbsv: AViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::tbsv: XViewType is not a Kokkos::View."); - static_assert(AViewType::rank == 2, - "KokkosBatched::tbsv: AViewType must have rank 2."); - static_assert(XViewType::rank == 1, - "KokkosBatched::tbsv: XViewType must have rank 1."); +KOKKOS_INLINE_FUNCTION static int checkTbsvInput([[maybe_unused]] const AViewType &A, + [[maybe_unused]] const XViewType &x, [[maybe_unused]] const int k) { + static_assert(Kokkos::is_view::value, "KokkosBatched::tbsv: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::tbsv: XViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::tbsv: AViewType must have rank 2."); + static_assert(XViewType::rank == 1, "KokkosBatched::tbsv: XViewType must have rank 1."); #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) if (k < 0) { @@ -70,97 +65,79 @@ KOKKOS_INLINE_FUNCTION static int checkTbsvInput( //// Lower non-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Lower transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalLowerTranspose::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Lower conjugate-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalLowerTranspose::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Upper non-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Upper transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalUpperTranspose::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; //// Upper conjugate-transpose //// template -struct SerialTbsv { +struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &x, const int k) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x, const int k) { auto info = checkTbsvInput(A, x, k); if (info) return info; return SerialTbsvInternalUpperTranspose::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), k); + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), k); } }; diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp index d2f5df4649..64221008cc 100644 --- a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp @@ -34,20 +34,15 @@ namespace KokkosBatched { template struct SerialTbsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int an, - const ValueType *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalLower::invoke( - const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalLower::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -76,20 +71,16 @@ SerialTbsvInternalLower::invoke( template struct SerialTbsvInternalLowerTranspose { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalLowerTranspose::invoke( - const bool use_unit_diag, const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalLowerTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -102,12 +93,9 @@ SerialTbsvInternalLowerTranspose::invoke( #pragma unroll #endif for (int i = Kokkos::min(an - 1, j + k); i > j; --i) { - temp -= - Kokkos::ArithTraits::conj(A[(i - j) * as0 + j * as1]) * - x[i * xs0]; + temp -= Kokkos::ArithTraits::conj(A[(i - j) * as0 + j * as1]) * x[i * xs0]; } - if (!use_unit_diag) - temp = temp / Kokkos::ArithTraits::conj(A[0 + j * as1]); + if (!use_unit_diag) temp = temp / Kokkos::ArithTraits::conj(A[0 + j * as1]); } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -130,20 +118,15 @@ SerialTbsvInternalLowerTranspose::invoke( template struct SerialTbsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int an, - const ValueType *KOKKOS_RESTRICT A, + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalUpper::invoke( - const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalUpper::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -172,20 +155,16 @@ SerialTbsvInternalUpper::invoke( template struct SerialTbsvInternalUpperTranspose { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT x, - const int xs0, const int k); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTbsvInternalUpperTranspose::invoke( - const bool use_unit_diag, const bool do_conj, const int an, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTbsvInternalUpperTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -197,13 +176,9 @@ SerialTbsvInternalUpperTranspose::invoke( #pragma unroll #endif for (int i = Kokkos::max(0, j - k); i < j; ++i) { - temp -= Kokkos::ArithTraits::conj( - A[(i + k - j) * as0 + j * as1]) * - x[i * xs0]; + temp -= Kokkos::ArithTraits::conj(A[(i + k - j) * as0 + j * as1]) * x[i * xs0]; } - if (!use_unit_diag) - temp = - temp / Kokkos::ArithTraits::conj(A[k * as0 + j * as1]); + if (!use_unit_diag) temp = temp / Kokkos::ArithTraits::conj(A[k * as0 + j * as1]); } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll diff --git a/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp index 044af0814c..6313d817c6 100644 --- a/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp @@ -23,164 +23,116 @@ namespace KokkosBatched { //// Lower non-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; //// Lower transpose ///// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; //// Lower conjugate-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightUpper::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; //// Upper non-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightUpper::invoke( - ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; //// Upper transpose ///// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightLower::invoke( - ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, false, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; //// Upper conjugate-transpose //// template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrmm { +struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { return SerialTrmmInternalRightLower::invoke( - ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), - B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), - B.stride_0(), B.stride_1()); + ArgDiag::use_unit_diag, true, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp index 3e4024974b..c36d04213d 100644 --- a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp @@ -27,41 +27,37 @@ namespace KokkosBatched { template struct SerialTrmmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template struct SerialTrmmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template struct SerialTrmmInternalRightLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template struct SerialTrmmInternalRightUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const bool use_unit_diag, const bool do_conj, const int am, const int an, - const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const bool do_conj, const int am, const int an, + const int bm, const int bn, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; // ech-note: use_unit_diag intentionally ignored for now. Compiler can optimize @@ -70,11 +66,9 @@ struct SerialTrmmInternalRightUpper { // if use_unit_diag. template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalLeftLower::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalLeftLower::invoke( + const bool /*use_unit_diag*/, const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -87,27 +81,23 @@ SerialTrmmInternalLeftLower::invoke( //} // printf("SerialTrmmInternalLeftLower\n"); - auto dotLowerLeftConj = - [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, const int __right_col) { - auto B_elems = __left_row; - ScalarType sum = 0; + auto dotLowerLeftConj = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { + auto B_elems = __left_row; + ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i <= B_elems; i++) { - // sum += A[left_row, i] * B[i, right_col] - sum += AT::conj(__A[__left_row * __as0 + i * __as1]) * - __B[i * __bs0 + __bs1 * __right_col]; - } - return sum; - }; + for (int i = 0; i <= B_elems; i++) { + // sum += A[left_row, i] * B[i, right_col] + sum += AT::conj(__A[__left_row * __as0 + i * __as1]) * __B[i * __bs0 + __bs1 * __right_col]; + } + return sum; + }; - auto dotLowerLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, const int __right_col) { + auto dotLowerLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __left_row, + ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, const int __right_col) { auto B_elems = __left_row; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -115,8 +105,7 @@ SerialTrmmInternalLeftLower::invoke( #endif for (int i = 0; i <= B_elems; i++) { // sum += A[left_row, i] * B[i, right_col] - sum += __A[__left_row * __as0 + i * __as1] * - __B[i * __bs0 + __bs1 * __right_col]; + sum += __A[__left_row * __as0 + i * __as1] * __B[i * __bs0 + __bs1 * __right_col]; } return sum; }; @@ -126,8 +115,7 @@ SerialTrmmInternalLeftLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -138,8 +126,7 @@ SerialTrmmInternalLeftLower::invoke( #endif for (int n = 0; n < right_n; n++) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotLowerLeftConj(A, as0, as1, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotLowerLeftConj(A, as0, as1, m, B, bs0, bs1, n); } else { B[m * bs0 + n * bs1] = dotLowerLeft(A, as0, as1, m, B, bs0, bs1, n); } @@ -155,11 +142,9 @@ SerialTrmmInternalLeftLower::invoke( // if use_unit_diag. template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalRightLower::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalRightLower::invoke( + const bool /*use_unit_diag*/, const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -174,11 +159,9 @@ SerialTrmmInternalRightLower::invoke( // Lower triangular matrix is on RHS with the base facing down. // Everytime we compute a new output row of B, we must shift over to the // right by one in A's column to ensure we skip the 0's. - auto dotLowerRightConj = [&](const ValueType *KOKKOS_RESTRICT __A, - const int __as0, const int __as1, const int __am, - const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, const int __right_col) { + auto dotLowerRightConj = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __am, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { auto B_elems = __am - 1; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -186,16 +169,13 @@ SerialTrmmInternalRightLower::invoke( #endif for (int i = __right_col; i <= B_elems; i++) { // sum += B[left_row, i] * A[i, right_col] - sum += __B[__bs0 * __left_row + i * __bs1] * - AT::conj(__A[i * __as0 + __right_col * __as1]); + sum += __B[__bs0 * __left_row + i * __bs1] * AT::conj(__A[i * __as0 + __right_col * __as1]); } return sum; }; - auto dotLowerRight = [&](const ValueType *KOKKOS_RESTRICT __A, - const int __as0, const int __as1, const int __am, - const int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, + auto dotLowerRight = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __am, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, const int __right_col) { auto B_elems = __am - 1; ScalarType sum = 0; @@ -204,8 +184,7 @@ SerialTrmmInternalRightLower::invoke( #endif for (int i = __right_col; i <= B_elems; i++) { // sum += B[left_row, i] * A[i, right_col] - sum += __B[__bs0 * __left_row + i * __bs1] * - __A[i * __as0 + __right_col * __as1]; + sum += __B[__bs0 * __left_row + i * __bs1] * __A[i * __as0 + __right_col * __as1]; } return sum; }; @@ -215,8 +194,7 @@ SerialTrmmInternalRightLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -227,11 +205,9 @@ SerialTrmmInternalRightLower::invoke( #endif for (int n = 0; n < right_n; n++) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotLowerRightConj(A, as0, as1, am, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotLowerRightConj(A, as0, as1, am, m, B, bs0, bs1, n); } else { - B[m * bs0 + n * bs1] = - dotLowerRight(A, as0, as1, am, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotLowerRight(A, as0, as1, am, m, B, bs0, bs1, n); } } } @@ -241,11 +217,9 @@ SerialTrmmInternalRightLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalLeftUpper::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalLeftUpper::invoke( + const bool /*use_unit_diag*/, const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -257,11 +231,9 @@ SerialTrmmInternalLeftUpper::invoke( // conjOp = AT::conj; //} - auto dotUpperLeftConj = [&](const ValueType *KOKKOS_RESTRICT __A, - const int __as0, const int __as1, const int __an, - const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, const int __right_col) { + auto dotUpperLeftConj = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __an, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { auto B_elems = __an - __left_row - 1; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -275,10 +247,9 @@ SerialTrmmInternalLeftUpper::invoke( return sum; }; - auto dotUpperLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __an, const int __left_row, - ValueType *KOKKOS_RESTRICT __B, const int __bs0, - const int __bs1, const int __right_col) { + auto dotUpperLeft = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __an, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { auto B_elems = __an - __left_row - 1; ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -286,8 +257,7 @@ SerialTrmmInternalLeftUpper::invoke( #endif for (int i = 0; i <= B_elems; i++) { // sum += A[left_row, i+left_row] * B[i+left_row, right_col] - sum += __A[__left_row * __as0 + (i + __left_row) * __as1] * - __B[(i + __left_row) * __bs0 + __bs1 * __right_col]; + sum += __A[__left_row * __as0 + (i + __left_row) * __as1] * __B[(i + __left_row) * __bs0 + __bs1 * __right_col]; } return sum; }; @@ -297,8 +267,7 @@ SerialTrmmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -309,11 +278,9 @@ SerialTrmmInternalLeftUpper::invoke( #endif for (int n = 0; n < right_n; ++n) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotUpperLeftConj(A, as0, as1, an, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotUpperLeftConj(A, as0, as1, an, m, B, bs0, bs1, n); } else { - B[m * bs0 + n * bs1] = - dotUpperLeft(A, as0, as1, an, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotUpperLeft(A, as0, as1, an, m, B, bs0, bs1, n); } } } @@ -323,11 +290,9 @@ SerialTrmmInternalLeftUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrmmInternalRightUpper::invoke( - const bool /*use_unit_diag*/, const bool do_conj, const int am, - const int an, const int bm, const int bn, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrmmInternalRightUpper::invoke( + const bool /*use_unit_diag*/, const bool do_conj, const int am, const int an, const int bm, const int bn, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); typedef Kokkos::ArithTraits AT; @@ -339,47 +304,41 @@ SerialTrmmInternalRightUpper::invoke( // conjOp = AT::conj; //} - auto dotUpperRightConj = - [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, const int __right_col) { - auto B_elems = __right_col; - ScalarType sum = 0; + auto dotUpperRightConj = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, + const int __left_row, ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, + const int __right_col) { + auto B_elems = __right_col; + ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i <= B_elems; i++) { - // sum += B[left_row, i] * A[i, right_col] - sum += __B[__left_row * __bs0 + i * __bs1] * - AT::conj(__A[i * __as0 + __right_col * __as1]); - } - return sum; - }; - - auto dotUpperRight = - [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, - const int __as1, const int __left_row, ValueType *KOKKOS_RESTRICT __B, - const int __bs0, const int __bs1, const int __right_col) { - auto B_elems = __right_col; - ScalarType sum = 0; + for (int i = 0; i <= B_elems; i++) { + // sum += B[left_row, i] * A[i, right_col] + sum += __B[__left_row * __bs0 + i * __bs1] * AT::conj(__A[i * __as0 + __right_col * __as1]); + } + return sum; + }; + + auto dotUpperRight = [&](const ValueType *KOKKOS_RESTRICT __A, const int __as0, const int __as1, const int __left_row, + ValueType *KOKKOS_RESTRICT __B, const int __bs0, const int __bs1, const int __right_col) { + auto B_elems = __right_col; + ScalarType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i <= B_elems; i++) { - // sum += B[left_row, i] * A[i, right_col] - sum += __B[__left_row * __bs0 + i * __bs1] * - __A[i * __as0 + __right_col * __as1]; - } - return sum; - }; + for (int i = 0; i <= B_elems; i++) { + // sum += B[left_row, i] * A[i, right_col] + sum += __B[__left_row * __bs0 + i * __bs1] * __A[i * __as0 + __right_col * __as1]; + } + return sum; + }; if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -390,8 +349,7 @@ SerialTrmmInternalRightUpper::invoke( #endif for (int n = right_n - 1; n >= 0; --n) { if (do_conj) { - B[m * bs0 + n * bs1] = - dotUpperRightConj(A, as0, as1, m, B, bs0, bs1, n); + B[m * bs0 + n * bs1] = dotUpperRightConj(A, as0, as1, m, B, bs0, bs1, n); } else { B[m * bs0 + n * bs1] = dotUpperRight(A, as0, as1, m, B, bs0, bs1, n); } diff --git a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp index 4d094c24d2..694ac36fa0 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp @@ -29,43 +29,32 @@ namespace KokkosBatched { /// B := inv(tril(A)) (alpha*B) /// A(m x m), B(m x n) -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -75,28 +64,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; @@ -105,43 +88,32 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_RIGHT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_RIGHT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -151,54 +123,42 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_1(), B.stride_0()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_1(), B.stride_0()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_1(), B.stride_0()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(1), B.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_1(), B.stride_0()); } }; @@ -207,43 +167,32 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -253,28 +202,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_0(), A.stride_1(), + B.data(), B.stride_0(), B.stride_1()); } }; @@ -284,42 +227,31 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -330,28 +262,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftUpper::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); } }; /// @@ -359,42 +285,31 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { typedef typename BViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = B.extent(0), n = B.extent(1); - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1 && B.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_1(), - (double *)B.data(), B.stride_1(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_1(), (double *)B.data(), B.stride_1(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1 && B.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)B.data(), B.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)B.data(), B.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -405,28 +320,22 @@ struct SerialTrsm -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); } }; template -struct SerialTrsm { +struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B) { - return SerialTrsmInternalLeftLower::invoke( - ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, B.extent(0), B.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp index a44943e5d6..0e65d269f0 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp @@ -34,40 +34,31 @@ namespace KokkosBatched { template struct SerialTrsmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const int n, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, - const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftLower::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftLower::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1, jend = n; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = - iend ? B + (p + 1) * bs0 : NULL; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = iend ? B + (p + 1) * bs0 : NULL; if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; @@ -83,8 +74,7 @@ SerialTrsmInternalLeftLower::invoke( #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < jend; ++j) - B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; } } return 0; @@ -92,10 +82,9 @@ SerialTrsmInternalLeftLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftLower::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftLower::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -104,16 +93,14 @@ SerialTrsmInternalLeftLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); InnerGemmFixA gemm(as0, as1, bs0, bs1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; for (int p = 0; p < ib; p += mb) { @@ -121,7 +108,7 @@ SerialTrsmInternalLeftLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, jb, Bp); @@ -131,8 +118,7 @@ SerialTrsmInternalLeftLower::invoke( // gemm update for (int i = p + mb; i < ib; i += mb) { const int mm = (i + mb) > ib ? (ib - i) : mb; - gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, BB + p * bs0, - mm, jb, pb, BB + i * bs0); + gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, BB + p * bs0, mm, jb, pb, BB + i * bs0); } } }; @@ -151,29 +137,23 @@ SerialTrsmInternalLeftLower::invoke( template struct SerialTrsmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const int n, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, - const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftUpper::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftUpper::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -199,8 +179,7 @@ SerialTrsmInternalLeftUpper::invoke( #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < jend; ++j) - B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; } } } @@ -209,10 +188,9 @@ SerialTrsmInternalLeftUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsmInternalLeftUpper::invoke( - const bool use_unit_diag, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int SerialTrsmInternalLeftUpper::invoke( + const bool use_unit_diag, const int m, const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); @@ -221,8 +199,7 @@ SerialTrsmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); @@ -230,17 +207,15 @@ SerialTrsmInternalLeftUpper::invoke( InnerGemmFixA gemm(as0, as1, bs0, bs1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; for (int pp = 0; pp < ib; pp += mb) { - const int ptmp = ib - pp - mb, p = ptmp < 0 ? 0 : ptmp, - pb = mb + (ptmp < 0) * ptmp; + const int ptmp = ib - pp - mb, p = ptmp < 0 ? 0 : ptmp, pb = mb + (ptmp < 0) * ptmp; // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, jb, Bp); @@ -249,8 +224,7 @@ SerialTrsmInternalLeftUpper::invoke( // gemm update for (int i = 0; i < p; i += mb) { - gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, Bp, - (i + mb) > p ? (p - i) : mb, jb, pb, BB + i * bs0); + gemm.serial_invoke(minus_one, AA + i * as0 + p * as1, Bp, (i + mb) > p ? (p - i) : mb, jb, pb, BB + i * bs0); } } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp index dbaba7fc6c..145f8e0c2d 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp @@ -34,17 +34,13 @@ namespace KokkosBatched { /// A(m x m), B(m x n) template -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_0(), B.stride_1()); } }; @@ -55,17 +51,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_1(), B.stride_0()); } }; @@ -76,17 +68,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), + B.stride_0(), B.stride_1()); } }; @@ -97,17 +85,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_0(), B.stride_1()); } }; @@ -118,17 +102,13 @@ struct TeamVectorTrsm -struct TeamVectorTrsm { +struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { return TeamVectorTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride_1(), A.stride_0(), B.data(), + B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index 3ee13f0b80..c1781a001c 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -32,30 +32,24 @@ namespace KokkosBatched { template struct TeamVectorTrsmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsmInternalLeftLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsmInternalLeftLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, - bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, - bs0, bs1); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -63,29 +57,23 @@ TeamVectorTrsmInternalLeftLower::invoke( int iend = m - p - 1; int jend = n; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = - iend ? B + (p + 1) * bs0 : NULL; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = iend ? B + (p + 1) * bs0 : NULL; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, iend), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, jend), [&](const int &j) { - // assume layout right for batched computation - B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, iend), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, jend), [&](const int &j) { + // assume layout right for batched computation + B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; + }); + }); } } return 0; @@ -94,31 +82,25 @@ TeamVectorTrsmInternalLeftLower::invoke( template struct TeamVectorTrsmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsmInternalLeftUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsmInternalLeftUpper::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, - bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, - bs0, bs1); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -128,24 +110,20 @@ TeamVectorTrsmInternalLeftUpper::invoke( int jend = n; const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, iend), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, jend), [&](const int &j) { - B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, iend), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, jend), + [&](const int &j) { B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; }); + }); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index 9f5f857e44..371dbb483c 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -34,32 +34,24 @@ namespace KokkosBatched { /// A(m x m), B(m x n) template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -70,32 +62,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; @@ -106,32 +90,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_1(), B.stride_0()); } }; @@ -142,32 +118,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_1(), B.stride_0()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), - B.stride_0()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(1), + B.extent(0), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_1(), B.stride_0()); } }; @@ -178,32 +146,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -214,32 +174,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftUpper::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftUpper::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; @@ -250,32 +202,24 @@ struct TeamTrsm -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; template -struct TeamTrsm { +struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { - return TeamTrsmInternalLeftLower::invoke( - member, ArgDiag::use_unit_diag, B.extent(0), B.extent(1), alpha, - A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), - B.stride_1()); + return TeamTrsmInternalLeftLower::invoke(member, ArgDiag::use_unit_diag, B.extent(0), + B.extent(1), alpha, A.data(), A.stride_1(), + A.stride_0(), B.data(), B.stride_0(), B.stride_1()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index a880186ae9..a1a7062809 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -35,29 +35,24 @@ namespace KokkosBatched { template struct TeamTrsmInternalLeftLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, - bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -65,27 +60,22 @@ TeamTrsmInternalLeftLower::invoke( int iend = m - p - 1; int jend = n; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = - iend ? B + (p + 1) * bs0 : NULL; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = iend ? B + (p + 1) * bs0 : NULL; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { - // assume layout right for batched computation - const int i = ij / jend, j = ij % jend; - B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { + // assume layout right for batched computation + const int i = ij / jend, j = ij % jend; + B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1]; + }); } } return 0; @@ -93,11 +83,9 @@ TeamTrsmInternalLeftLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -107,9 +95,7 @@ TeamTrsmInternalLeftLower::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, - bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; /// @@ -120,8 +106,7 @@ TeamTrsmInternalLeftLower::invoke( InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; const int tsize = member.team_size(); @@ -134,25 +119,22 @@ TeamTrsmInternalLeftLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), - [&](const int jj) { - // Made this non-const in order to WORKAROUND issue #349 - int j = jj * nb, qb = (j + nb) > jb ? np : nb; - if (use_unit_diag) - trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); - else - trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), [&](const int jj) { + // Made this non-const in order to WORKAROUND issue #349 + int j = jj * nb, qb = (j + nb) > jb ? np : nb; + if (use_unit_diag) + trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); + else + trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); + }); member.team_barrier(); // gemm update - TeamGemmInternal::invoke( - member, ib - p - pb, jb, pb, minus_one, Ap + pb * as0, as0, as1, Bp, - bs0, bs1, one, Bp + pb * bs0, bs0, bs1); + TeamGemmInternal::invoke(member, ib - p - pb, jb, pb, minus_one, Ap + pb * as0, as0, as1, + Bp, bs0, bs1, one, Bp + pb * bs0, bs0, bs1); } }; @@ -170,20 +152,17 @@ TeamTrsmInternalLeftLower::invoke( template struct TeamTrsmInternalLeftUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const bool use_unit_diag, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1); }; template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftUpper::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); @@ -191,9 +170,7 @@ TeamTrsmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, - bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -203,30 +180,27 @@ TeamTrsmInternalLeftUpper::invoke( int jend = n; const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT b1t = B + p * bs0; member.team_barrier(); if (!use_unit_diag) { const ValueType alpha11 = A[p * as0 + p * as1]; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, jend), - [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, jend), + [&](const int &j) { b1t[j * bs1] = b1t[j * bs1] / alpha11; }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { - int i, j; - if (KokkosKernels::Impl::kk_is_gpu_exec_space< - typename MemberType::execution_space>()) { - i = ij % iend; - j = ij / iend; - } else { - i = ij / jend; - j = ij % jend; - } - B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { + int i, j; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij % iend; + j = ij / iend; + } else { + i = ij / jend; + j = ij % jend; + } + B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; + }); } } return 0; @@ -234,11 +208,9 @@ TeamTrsmInternalLeftUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -TeamTrsmInternalLeftUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, +KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftUpper::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -248,16 +220,13 @@ TeamTrsmInternalLeftUpper::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, - bs1); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftUpperNonUnitDiag trsm_n(as0, as1, bs0, bs1); - auto trsm = [&](const int ib, const int jb, - const ValueType *KOKKOS_RESTRICT AA, + auto trsm = [&](const int ib, const int jb, const ValueType *KOKKOS_RESTRICT AA, /**/ ValueType *KOKKOS_RESTRICT BB) { const int mb = mbAlgo; //(ib <=5 ? ib : mbAlgo); const int tsize = member.team_size(); @@ -265,29 +234,25 @@ TeamTrsmInternalLeftUpper::invoke( int nb = (jb / tsize + jb % tsize > 0); int np = jb % nb; for (int pp = 0; pp < ib; pp += mb) { - const int ptmp = (ib - pp - mb), p = (ptmp < 0 ? 0 : ptmp), - pb = (mb + (ptmp < 0) * ptmp); + const int ptmp = (ib - pp - mb), p = (ptmp < 0 ? 0 : ptmp), pb = (mb + (ptmp < 0) * ptmp); // trsm update const ValueType *KOKKOS_RESTRICT Ap = AA + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT Bp = BB + p * bs0; member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), - [&](const int &jj) { - const int j = jj * nb, qb = (j + nb) > jb ? np : nb; - if (use_unit_diag) - trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); - else - trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, (jb / nb) + (np > 0)), [&](const int &jj) { + const int j = jj * nb, qb = (j + nb) > jb ? np : nb; + if (use_unit_diag) + trsm_u.serial_invoke(Ap, pb, qb, Bp + j * bs1); + else + trsm_n.serial_invoke(Ap, pb, qb, Bp + j * bs1); + }); member.team_barrier(); // gemm update - TeamGemmInternal::invoke( - member, p, jb, pb, minus_one, Ap - p * as0, as0, as1, Bp, bs0, bs1, - one, BB, bs0, bs1); + TeamGemmInternal::invoke(member, p, jb, pb, minus_one, Ap - p * as0, as0, as1, Bp, bs0, + bs1, one, BB, bs0, bs1); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp index 0fc375a7b2..073970caa6 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp @@ -38,43 +38,32 @@ namespace KokkosBatched { /// L/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -84,28 +73,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; @@ -113,42 +94,31 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_LOWER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -159,27 +129,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; template struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; @@ -187,43 +150,32 @@ struct SerialTrsv { /// U/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ - defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_NOTRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, - (MKL_INT)vector_type::vector_length); + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, alpha, (const double *)A.data(), + A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; } @@ -233,28 +185,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; template -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), A.stride_0(), - A.stride_1(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), b.data(), b.stride_0()); } }; @@ -262,42 +206,31 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { typedef typename bViewType::value_type vector_type; // typedef typename vector_type::value_type value_type; const int m = b.extent(0), n = 1; - static_assert(is_vector::value, - "value type is not vector type"); - static_assert( - vector_type::vector_length == 4 || vector_type::vector_length == 8, - "AVX, AVX2 and AVX512 is supported"); - const MKL_COMPACT_PACK format = - vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; // no error check int r_val = 0; if (A.stride_0() == 1) { - mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_COL_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else if (A.stride_1() == 1) { - mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, - ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, m, n, - alpha, (const double *)A.data(), A.stride_0(), - (double *)b.data(), b.stride_0(), format, + mkl_dtrsm_compact(MKL_ROW_MAJOR, MKL_LEFT, MKL_UPPER, MKL_TRANS, ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, alpha, (const double *)A.data(), A.stride_0(), (double *)b.data(), b.stride_0(), format, (MKL_INT)vector_type::vector_length); } else { r_val = -1; @@ -308,27 +241,20 @@ struct SerialTrsv -struct SerialTrsv { +struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; template struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const bViewType &b) { - return SerialTrsvInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const bViewType &b) { + return SerialTrsvInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp index 3ae206cc09..43d95377d4 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp @@ -38,39 +38,33 @@ namespace KokkosBatched { template struct SerialTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, - const int bs0); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, const int bs0); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsvInternalLower::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, - *KOKKOS_RESTRICT b2 = - iend ? beta1 + bs0 : NULL; + ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, *KOKKOS_RESTRICT b2 = iend ? beta1 + bs0 : NULL; // with KOKKOS_RESTRICT a compiler assumes that the pointer is not // accessed by others op(/=) uses this pointer and changes the associated @@ -85,10 +79,12 @@ SerialTrsvInternalLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); constexpr int mbAlgo = Algo::Trsv::Blocked::mb(); @@ -96,8 +92,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -110,7 +105,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, 1, bp); @@ -118,9 +113,8 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( trsm_n.serial_invoke(Ap, pb, 1, bp); // gemv update - KokkosBlas::Impl::SerialGemvInternal::invoke( - m - p - pb, pb, minus_one, Ap + pb * as0, as0, as1, bp, bs0, one, - bp + pb * bs0, bs0); + KokkosBlas::Impl::SerialGemvInternal::invoke(m - p - pb, pb, minus_one, Ap + pb * as0, as0, + as1, bp, bs0, one, bp + pb * bs0, bs0); } } return 0; @@ -133,36 +127,33 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( template struct SerialTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, - const int bs0); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, const int bs0); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrsvInternalUpper::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; for (int p = (m - 1); p >= 0; --p) { const int iend = p; - const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; + const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; + /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; // with KOKKOS_RESTRICT a compiler assumes that the pointer is not // accessed by others op(/=) uses this pointer and changes the associated @@ -177,10 +168,12 @@ SerialTrsvInternalUpper::invoke( template <> template -KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( - const bool use_unit_diag, const int m, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { +KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke(const bool use_unit_diag, const int m, + const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT b, + const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); @@ -189,8 +182,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( if (alpha == zero) KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); @@ -198,12 +190,11 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( const int mb = mbAlgo; for (int pp = 0; pp < m; pp += mb) { - const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 0 : ptmp), - pb = (mb + (ptmp < 0) * ptmp); + const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 0 : ptmp), pb = (mb + (ptmp < 0) * ptmp); // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, 1, bp); @@ -211,8 +202,8 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( trsm_n.serial_invoke(Ap, pb, 1, bp); // gemv update - KokkosBlas::Impl::SerialGemvInternal::invoke( - p, pb, minus_one, Ap - p * as0, as0, as1, bp, bs0, one, b, bs0); + KokkosBlas::Impl::SerialGemvInternal::invoke(p, pb, minus_one, Ap - p * as0, as0, as1, bp, + bs0, one, b, bs0); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp index 8e14b5ef37..42c242414c 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp @@ -38,16 +38,13 @@ namespace KokkosBatched { /// template -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamVectorTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + b.data(), b.stride_0()); } }; @@ -56,16 +53,13 @@ struct TeamVectorTrsv -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamVectorTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + b.data(), b.stride_0()); } }; @@ -74,16 +68,13 @@ struct TeamVectorTrsv -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamVectorTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(0), + alpha, A.data(), A.stride_0(), A.stride_1(), + b.data(), b.stride_0()); } }; @@ -92,16 +83,13 @@ struct TeamVectorTrsv -struct TeamVectorTrsv { +struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamVectorTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamVectorTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(1), + alpha, A.data(), A.stride_1(), A.stride_0(), + b.data(), b.stride_0()); } }; diff --git a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index 40bca5a64a..894e684ef2 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -36,12 +36,10 @@ namespace KokkosBatched { template struct TeamVectorTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -49,31 +47,24 @@ struct TeamVectorTrsvInternalLower { template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsvInternalLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsvInternalLower::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, - bs0); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, - *KOKKOS_RESTRICT b2 = - iend ? beta1 + bs0 : NULL; + ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, *KOKKOS_RESTRICT b2 = iend ? beta1 + bs0 : NULL; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -82,12 +73,10 @@ TeamVectorTrsvInternalLower::invoke( local_beta1 = local_beta1 / alpha11; member.team_barrier(); - Kokkos::single(Kokkos::PerTeam(member), - [&]() { *beta1 = local_beta1; }); + Kokkos::single(Kokkos::PerTeam(member), [&]() { *beta1 = local_beta1; }); } - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, iend), - [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, iend), + [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); } } return 0; @@ -100,12 +89,10 @@ TeamVectorTrsvInternalLower::invoke( template struct TeamVectorTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -113,28 +100,24 @@ struct TeamVectorTrsvInternalUpper { template <> template -KOKKOS_INLINE_FUNCTION int -TeamVectorTrsvInternalUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, +KOKKOS_INLINE_FUNCTION int TeamVectorTrsvInternalUpper::invoke( + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, - bs0); + if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; for (int p = (m - 1); p >= 0; --p) { const int iend = p; - const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; + const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; + /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -143,12 +126,10 @@ TeamVectorTrsvInternalUpper::invoke( local_beta1 = local_beta1 / alpha11; member.team_barrier(); - Kokkos::single(Kokkos::PerTeam(member), - [&]() { *beta1 = local_beta1; }); + Kokkos::single(Kokkos::PerTeam(member), [&]() { *beta1 = local_beta1; }); } - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, iend), - [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, iend), + [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp index 7f370c1f01..c658080dc2 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp @@ -38,30 +38,24 @@ namespace KokkosBatched { /// template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; @@ -70,30 +64,23 @@ struct TeamTrsv -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(1), alpha, + A.data(), A.stride_1(), A.stride_0(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), A.stride_1(), - A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), + A.stride_1(), A.stride_0(), b.data(), b.stride_0()); } }; @@ -102,30 +89,24 @@ struct TeamTrsv -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalUpper::invoke( - member, ArgDiag::use_unit_diag, A.extent(0), alpha, A.data(), - A.stride_0(), A.stride_1(), b.data(), b.stride_0()); + return TeamTrsvInternalUpper::invoke(member, ArgDiag::use_unit_diag, A.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), b.data(), + b.stride_0()); } }; @@ -134,30 +115,24 @@ struct TeamTrsv -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(1), alpha, + A.data(), A.stride_1(), A.stride_0(), b.data(), + b.stride_0()); } }; template -struct TeamTrsv { +struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { - return TeamTrsvInternalLower::invoke( - member, ArgDiag::use_unit_diag, A.extent(1), alpha, A.data(), - A.stride_1(), A.stride_0(), b.data(), b.stride_0()); + return TeamTrsvInternalLower::invoke(member, ArgDiag::use_unit_diag, A.extent(1), alpha, + A.data(), A.stride_1(), A.stride_0(), b.data(), + b.stride_0()); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index 600a0c6e81..ba3b2ff7b5 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -38,12 +38,10 @@ namespace KokkosBatched { template struct TeamTrsvInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -52,28 +50,23 @@ struct TeamTrsvInternalLower { template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { const int iend = m - p - 1; - const ValueType *KOKKOS_RESTRICT a21 = - iend ? A + (p + 1) * as0 + p * as1 : NULL; + const ValueType *KOKKOS_RESTRICT a21 = iend ? A + (p + 1) * as0 + p * as1 : NULL; - ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, - *KOKKOS_RESTRICT b2 = - iend ? beta1 + bs0 : NULL; + ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0, *KOKKOS_RESTRICT b2 = iend ? beta1 + bs0 : NULL; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -85,9 +78,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( if (member.team_rank() == 0) *beta1 = local_beta1; } /// member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend), - [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), + [&](const int &i) { b2[i * bs0] -= a21[i * as0] * local_beta1; }); } } return 0; @@ -96,9 +88,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); @@ -107,8 +98,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -122,7 +112,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; member.team_barrier(); if (member.team_rank() == 0) { @@ -134,9 +124,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( // gemv update member.team_barrier(); - KokkosBlas::Impl::TeamGemvInternal::invoke( - member, m - p - pb, pb, minus_one, Ap + pb * as0, as0, as1, bp, 1, - one, bp + pb * bs0, bs0); + KokkosBlas::Impl::TeamGemvInternal::invoke(member, m - p - pb, pb, minus_one, Ap + pb * as0, + as0, as1, bp, 1, one, bp + pb * bs0, bs0); } } return 0; @@ -149,12 +138,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( template struct TeamTrsvInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType & /*member*/, const bool /*use_unit_diag*/, - const int /*m*/, const ScalarType /*alpha*/, - const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, - const int /*as1*/, - /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const bool /*use_unit_diag*/, const int /*m*/, + const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/, + const int /*as0*/, const int /*as1*/, + /**/ ValueType *KOKKOS_RESTRICT /*b*/, const int /*bs0*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -163,25 +150,23 @@ struct TeamTrsvInternalUpper { template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0); if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; for (int p = (m - 1); p >= 0; --p) { const int iend = p; - const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; - /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; + const ValueType *KOKKOS_RESTRICT a01 = A + p * as1; + /**/ ValueType *KOKKOS_RESTRICT beta1 = b + p * bs0; member.team_barrier(); ValueType local_beta1 = *beta1; @@ -193,9 +178,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( if (member.team_rank() == 0) *beta1 = local_beta1; } // member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, iend), - [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend), + [&](const int &i) { b0[i * bs0] -= a01[i * as0] * local_beta1; }); } } return 0; @@ -204,9 +188,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( template <> template KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( - const MemberType &member, const bool use_unit_diag, const int m, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, + const MemberType &member, const bool use_unit_diag, const int m, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT b, const int bs0) { const ScalarType one(1.0), zero(0.0), minus_one(-1.0); @@ -216,8 +199,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( if (alpha == zero) KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); @@ -225,12 +207,11 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( const int mb = mbAlgo; for (int pp = 0; pp < m; pp += mb) { - const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 0 : ptmp), - pb = (mb + (ptmp < 0) * ptmp); + const int ptmp = (m - pp - mb), p = (ptmp < 0 ? 0 : ptmp), pb = (mb + (ptmp < 0) * ptmp); // trsm update const ValueType *KOKKOS_RESTRICT Ap = A + p * as0 + p * as1; - /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; + /**/ ValueType *KOKKOS_RESTRICT bp = b + p * bs0; member.team_barrier(); if (member.team_rank() == 0) { @@ -242,8 +223,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( // gemv update member.team_barrier(); - KokkosBlas::Impl::TeamGemvInternal::invoke( - member, p, pb, minus_one, Ap - p * as0, as0, as1, bp, 1, one, b, bs0); + KokkosBlas::Impl::TeamGemvInternal::invoke(member, p, pb, minus_one, Ap - p * as0, as0, + as1, bp, 1, one, b, bs0); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp index 66c8f91ac9..1068bf9e54 100644 --- a/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp @@ -25,18 +25,16 @@ template struct SerialTrtri { template KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A) { - return SerialTrtriInternalLower::invoke( - ArgDiag::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride_0(), A.stride_1()); + return SerialTrtriInternalLower::invoke(ArgDiag::use_unit_diag, A.extent(0), A.extent(1), + A.data(), A.stride_0(), A.stride_1()); } }; template struct SerialTrtri { template KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A) { - return SerialTrtriInternalUpper::invoke( - ArgDiag::use_unit_diag, A.extent(0), A.extent(1), A.data(), A.stride(0), - A.stride(1)); + return SerialTrtriInternalUpper::invoke(ArgDiag::use_unit_diag, A.extent(0), A.extent(1), + A.data(), A.stride(0), A.stride(1)); } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp index 2941b03ccf..f6b0b4bf6d 100644 --- a/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp @@ -25,27 +25,23 @@ namespace KokkosBatched { template struct SerialTrtriInternalLower { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int am, const int an, - ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int am, const int an, + ValueType *KOKKOS_RESTRICT A, const int as0, const int as1); }; template struct SerialTrtriInternalUpper { template - KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, - const int am, const int an, - ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1); + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, const int am, const int an, + ValueType *KOKKOS_RESTRICT A, const int as0, const int as1); }; template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrtriInternalLower::invoke( - const bool use_unit_diag, const int am, const int /*an*/, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { +KOKKOS_INLINE_FUNCTION int SerialTrtriInternalLower::invoke(const bool use_unit_diag, + const int am, const int /*an*/, + ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1) { ValueType one(1.0), zero(0.0), A_ii; if (!use_unit_diag) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) @@ -74,14 +70,13 @@ SerialTrtriInternalLower::invoke( int A_col_vec_m = am - i - 1, A_col_vec_n = 1; // TRMV/TRMM −− x=Ax // A((j+1):n,j) = A((j+1):n,(j+1):n) ∗ A((j+1):n,j) ; - SerialTrmmInternalLeftLower::invoke( - use_unit_diag, false, A_subblock_m, A_subblock_n, A_col_vec_m, - A_col_vec_n, one, A_subblock, as0, as1, A_col_vec, as0, as1); + SerialTrmmInternalLeftLower::invoke(use_unit_diag, false, A_subblock_m, A_subblock_n, + A_col_vec_m, A_col_vec_n, one, A_subblock, as0, as1, + A_col_vec, as0, as1); // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, - A_ii, A_col_vec, as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, as0, as1); } } return 0; @@ -89,10 +84,10 @@ SerialTrtriInternalLower::invoke( template <> template -KOKKOS_INLINE_FUNCTION int -SerialTrtriInternalUpper::invoke( - const bool use_unit_diag, const int am, const int /*an*/, - ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { +KOKKOS_INLINE_FUNCTION int SerialTrtriInternalUpper::invoke(const bool use_unit_diag, + const int am, const int /*an*/, + ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1) { ValueType one(1.0), zero(0.0), A_ii; if (!use_unit_diag) { @@ -123,14 +118,13 @@ SerialTrtriInternalUpper::invoke( // TRMV/TRMM −− x=Ax // A(1:(j-1),j) = A(1:(j-1),1:(j-1)) ∗ A(1:(j-1),j) ; // SerialTrmm - SerialTrmmInternalLeftUpper::invoke( - use_unit_diag, false, A_subblock_m, A_subblock_n, A_col_vec_m, - A_col_vec_n, one, A_subblock, as0, as1, A_col_vec, as0, as1); + SerialTrmmInternalLeftUpper::invoke(use_unit_diag, false, A_subblock_m, A_subblock_n, + A_col_vec_m, A_col_vec_n, one, A_subblock, as0, as1, + A_col_vec, as0, as1); // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, - A_ii, A_col_vec, as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, as0, as1); } } return 0; diff --git a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp index b57a145ccb..de5ecebf94 100644 --- a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -29,16 +29,13 @@ namespace KokkosBatched { template struct TeamVectorUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const AViewType &A, const pViewType &p, - const UViewType &U, const VViewType &V, const wViewType &w, - int &matrix_rank) { - return TeamVectorUTV_Internal::invoke( - member, A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), - p.data(), p.stride(0), U.data(), U.stride(0), U.stride(1), V.data(), - V.stride(0), V.stride(1), w.data(), matrix_rank); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const pViewType &p, + const UViewType &U, const VViewType &V, const wViewType &w, + int &matrix_rank) { + return TeamVectorUTV_Internal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), + p.data(), p.stride(0), U.data(), U.stride(0), U.stride(1), V.data(), + V.stride(0), V.stride(1), w.data(), matrix_rank); } }; diff --git a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp index 1066467414..e39dba9a40 100644 --- a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -32,15 +32,14 @@ namespace KokkosBatched { /// =================== struct TeamVectorUTV_Internal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, - const int n, // m = NumRows(A), n = NumCols(A) - /* */ ValueType *A, const int as0, const int as1, - /* */ IntType *p, const int ps0, - /* */ ValueType *U, const int us0, const int us1, - /* */ ValueType *V, const int vs0, const int vs1, - /* */ ValueType *w, // 3*m, tau, norm, householder workspace - /* */ int &matrix_rank) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, + const int n, // m = NumRows(A), n = NumCols(A) + /* */ ValueType *A, const int as0, const int as1, + /* */ IntType *p, const int ps0, + /* */ ValueType *U, const int us0, const int us1, + /* */ ValueType *V, const int vs0, const int vs1, + /* */ ValueType *w, // 3*m, tau, norm, householder workspace + /* */ int &matrix_rank) { typedef ValueType value_type; // typedef IntType int_type; @@ -51,25 +50,19 @@ struct TeamVectorUTV_Internal { value_type *work = w; matrix_rank = -1; - TeamVectorQR_WithColumnPivotingInternal ::invoke( - member, m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); + TeamVectorQR_WithColumnPivotingInternal ::invoke(member, m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); - TeamVectorQR_FormQ_Internal ::invoke(member, m, matrix_rank, matrix_rank, A, - as0, as1, t, ts0, U, us0, us1, work); + TeamVectorQR_FormQ_Internal ::invoke(member, m, matrix_rank, matrix_rank, A, as0, as1, t, ts0, U, us0, us1, work); member.team_barrier(); /// for rank deficient matrix if (matrix_rank < n) { const value_type zero(0); - TeamVectorSetLowerTriangularInternal ::invoke( - member, matrix_rank, matrix_rank, 1, zero, A, as0, as1); + TeamVectorSetLowerTriangularInternal ::invoke(member, matrix_rank, matrix_rank, 1, zero, A, as0, as1); - TeamVectorQR_Internal ::invoke(member, n, matrix_rank, A, as1, as0, t, - ts0, work); + TeamVectorQR_Internal ::invoke(member, n, matrix_rank, A, as1, as0, t, ts0, work); - TeamVectorQR_FormQ_Internal ::invoke(member, n, matrix_rank, matrix_rank, - A, as1, as0, t, ts0, V, vs1, vs0, - work); + TeamVectorQR_FormQ_Internal ::invoke(member, n, matrix_rank, matrix_rank, A, as1, as0, t, ts0, V, vs1, vs0, work); } return 0; diff --git a/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp b/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp index 54e2791dbb..3f56e71422 100644 --- a/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp @@ -30,9 +30,8 @@ namespace KokkosBatched { /// struct SerialUpdateGivensInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const Kokkos::pair &S, - /* */ Kokkos::pair &G) { + KOKKOS_INLINE_FUNCTION static int invoke(const Kokkos::pair &S, + /* */ Kokkos::pair &G) { const ValueType tmp = S.first * G.first - S.second * G.second; G.second = S.first * G.second + S.second * G.first; G.first = tmp; diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp index f87492ea5a..08628729bc 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp @@ -24,23 +24,21 @@ namespace KokkosBatched { #define KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) Vector, l> -#define KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) \ - Vector, l> & +#define KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) Vector, l> & /// simd, simd #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator+( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator+(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_add_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator+(const Vector >, 4> &a, - const Vector >, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator+( + const Vector >, 4> &a, const Vector >, 4> &b) { return _mm512_add_pd(a, b); } #endif @@ -48,16 +46,15 @@ operator+(const Vector >, 4> &a, #endif #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_add_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) -operator+(const Vector >, 2> &a, - const Vector >, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator+( + const Vector >, 2> &a, const Vector >, 2> &b) { return _mm256_add_pd(a, b); } #endif @@ -66,8 +63,8 @@ operator+(const Vector >, 2> &a, #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator+(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator+(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -80,24 +77,24 @@ operator+(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator+( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator+(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x + b.float2().x; r_val.y = a.float2().y + b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator+( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator+(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x + b.double2().x; r_val.y = a.double2().y + b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator+( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator+(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x + b.float4().x; r_val.y = a.float4().y + b.float4().y; @@ -106,8 +103,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator+( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+(const Vector, 4> &a, + const Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x + b.double4().x; r_val.y = a.double4().y + b.double4().y; @@ -119,9 +116,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator+( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator+=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator+=( + Vector, l> &a, const Vector, l> &b) { a = a + b; return a; } @@ -129,37 +125,34 @@ operator+=(Vector, l> &a, const Vector, l> &b) { /// simd, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator+(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator+(const Vector, l> &a, + const T b) { return a + Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator+(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator+(const T a, + const Vector, l> &b) { return Vector, l>(a) + b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator+=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator+=( + Vector, l> &a, const T b) { a = a + b; return a; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator++(Vector, l> &a, int) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator++(Vector, l> &a, int) { Vector, l> a0 = a; a = a + typename Kokkos::ArithTraits::mag_type(1); return a0; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator++(Vector, l> &a) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator++( + Vector, l> &a) { a = a + typename Kokkos::ArithTraits::mag_type(1); return a; } @@ -167,23 +160,20 @@ operator++(Vector, l> &a) { /// simd complex, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const Vector >, l> &a, const T b) { return a + Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const T a, const Vector >, l> &b) { return Vector >, l>(a) + b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator+=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator+=( + Vector >, l> &a, const T b) { a = a + b; return a; } @@ -191,26 +181,20 @@ operator+=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const Vector >, l> &a, const Kokkos::complex b) { return a + Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator+(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator+( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) + b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator+=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator+=( + Vector >, l> &a, const Kokkos::complex b) { a = a + b; return a; } @@ -222,16 +206,15 @@ operator+=(Vector >, l> &a, #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator-( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator-(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_sub_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator-(const Vector >, 4> &a, - const Vector >, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator-( + const Vector >, 4> &a, const Vector >, 4> &b) { return _mm512_sub_pd(a, b); } #endif @@ -239,16 +222,15 @@ operator-(const Vector >, 4> &a, #endif #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_sub_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) -operator-(const Vector >, 2> &a, - const Vector >, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator-( + const Vector >, 2> &a, const Vector >, 2> &b) { return _mm256_sub_pd(a, b); } #endif @@ -257,8 +239,8 @@ operator-(const Vector >, 2> &a, #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator-(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -271,24 +253,24 @@ operator-(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator-( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator-(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x - b.float2().x; r_val.y = a.float2().y - b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator-( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator-(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x - b.double2().x; r_val.y = a.double2().y - b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator-( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator-(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x - b.float4().x; r_val.y = a.float4().y - b.float4().y; @@ -297,8 +279,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator-( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-(const Vector, 4> &a, + const Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x - b.double4().x; r_val.y = a.double4().y - b.double4().y; @@ -309,8 +291,7 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator-( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator-(const Vector, l> &a) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const Vector, l> &a) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -322,9 +303,8 @@ operator-(const Vector, l> &a) { } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator-=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator-=( + Vector, l> &a, const Vector, l> &b) { a = a - b; return a; } @@ -332,37 +312,34 @@ operator-=(Vector, l> &a, const Vector, l> &b) { /// simd, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator-(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const Vector, l> &a, + const T b) { return a - Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator-(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator-(const T a, + const Vector, l> &b) { return Vector, l>(a) - b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator-=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator-=( + Vector, l> &a, const T b) { a = a - b; return a; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator--(Vector, l> &a, int) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator--(Vector, l> &a, int) { Vector, l> a0 = a; a = a - typename Kokkos::ArithTraits::mag_type(1); return a0; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator--(Vector, l> &a) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator--( + Vector, l> &a) { a = a - typename Kokkos::ArithTraits::mag_type(1); return a; } @@ -370,23 +347,20 @@ operator--(Vector, l> &a) { /// simd complex, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const Vector >, l> &a, const T b) { return a - Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const T a, const Vector >, l> &b) { return Vector >, l>(a) - b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator-=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator-=( + Vector >, l> &a, const T b) { a = a - b; return a; } @@ -394,26 +368,20 @@ operator-=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const Vector >, l> &a, const Kokkos::complex b) { return a - Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator-(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator-( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) - b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator-=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator-=( + Vector >, l> &a, const Kokkos::complex b) { a = a - b; return a; } @@ -425,30 +393,25 @@ operator-=(Vector >, l> &a, #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator*( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator*(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_mul_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator - *(const Vector >, 4> &a, - const Vector >, 4> &b) { - const __m512d as = _mm512_permute_pd(a, 0x55), - br = _mm512_permute_pd(b, 0x00), - bi = _mm512_permute_pd(b, 0xff); +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator*( + const Vector >, 4> &a, const Vector >, 4> &b) { + const __m512d as = _mm512_permute_pd(a, 0x55), br = _mm512_permute_pd(b, 0x00), bi = _mm512_permute_pd(b, 0xff); #if defined(__FMA__) // latency 7, throughput 0.5 return _mm512_fmaddsub_pd(a, br, _mm512_mul_pd(as, bi)); #else - return _mm512_add_pd( - _mm512_mul_pd(a, br), - _mm512_castsi512_pd(_mm512_xor_si512( - _mm512_castpd_si512(_mm512_mul_pd(as, bi)), - _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( - _mm512_setzero_pd(), 0x55, _mm256_set1_pd(-0.0)))))); + return _mm512_add_pd(_mm512_mul_pd(a, br), + _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(_mm512_mul_pd(as, bi)), + _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( + _mm512_setzero_pd(), 0x55, _mm256_set1_pd(-0.0)))))); // const __mm512d cc = _mm512_mul_pd(as, bi); // return _mm512_mask_sub_pd(_mm512_mask_add_pd(_mm512_mul_pd(a, br), 0x55, // cc), 0xaa, cc); @@ -459,25 +422,21 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator #endif #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_mul_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static Vector >, 2> operator*( - const Vector >, 2> &a, - const Vector >, 2> &b) { - const __m256d as = _mm256_permute_pd(a, 0x5), br = _mm256_permute_pd(b, 0x0), - bi = _mm256_permute_pd(b, 0xf); +static Vector >, 2> operator*(const Vector >, 2> &a, + const Vector >, 2> &b) { + const __m256d as = _mm256_permute_pd(a, 0x5), br = _mm256_permute_pd(b, 0x0), bi = _mm256_permute_pd(b, 0xf); #if defined(__FMA__) return _mm256_fmaddsub_pd(a, br, _mm256_mul_pd(as, bi)); #else - return _mm256_add_pd(_mm256_mul_pd(a, br), - _mm256_xor_pd(_mm256_mul_pd(as, bi), - _mm256_set_pd(0.0, -0.0, 0.0, -0.0))); + return _mm256_add_pd(_mm256_mul_pd(a, br), _mm256_xor_pd(_mm256_mul_pd(as, bi), _mm256_set_pd(0.0, -0.0, 0.0, -0.0))); #endif } #endif @@ -486,8 +445,8 @@ static Vector >, 2> operator*( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator*(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator*(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -500,24 +459,24 @@ operator*(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator*( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator*(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x * b.float2().x; r_val.y = a.float2().y * b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator*( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator*(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x * b.double2().x; r_val.y = a.double2().y * b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator*( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator*(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x * b.float4().x; r_val.y = a.float4().y * b.float4().y; @@ -526,8 +485,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator*( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*(const Vector, 4> &a, + const Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x * b.double4().x; r_val.y = a.double4().y * b.double4().y; @@ -538,9 +497,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator*( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator*=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator*=( + Vector, l> &a, const Vector, l> &b) { a = a * b; return a; } @@ -548,21 +506,20 @@ operator*=(Vector, l> &a, const Vector, l> &b) { /// simd, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator*(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator*(const Vector, l> &a, + const T b) { return a * Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator*(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator*(const T a, + const Vector, l> &b) { return Vector, l>(a) * b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator*=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator*=( + Vector, l> &a, const T b) { a = a * b; return a; } @@ -585,8 +542,8 @@ operator*(const Vector >, 4> &a, const double b) { #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator - *(const Vector >, 2> &a, const double b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator*( + const Vector >, 2> &a, const double b) { return _mm256_mul_pd(a, _mm256_set1_pd(b)); } #endif @@ -595,9 +552,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const Vector >, l> &a, const T b) { return a * Vector >, l>(b); } @@ -617,8 +573,8 @@ operator*(const double a, const Vector >, 4> &b) { #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator - *(const double a, const Vector >, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator*( + const double a, const Vector >, 2> &b) { return _mm256_mul_pd(_mm256_set1_pd(a), b); } #endif @@ -627,16 +583,14 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const T a, const Vector >, l> &b) { return Vector >, l>(a) * b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator*=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator*=( + Vector >, l> &a, const T b) { a = a * b; return a; } @@ -644,26 +598,20 @@ operator*=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const Vector >, l> &a, const Kokkos::complex b) { return a * Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator*(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator*( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) * b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator*=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator*=( + Vector >, l> &a, const Kokkos::complex b) { a = a * b; return a; } @@ -675,36 +623,30 @@ operator*=(Vector >, l> &a, #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX512F__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator/( - const Vector, 8> &a, const Vector, 8> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 8) operator/(const Vector, 8> &a, + const Vector, 8> &b) { return _mm512_div_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator/(const Vector >, 4> &a, - const Vector >, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator/( + const Vector >, 4> &a, const Vector >, 4> &b) { const __m512d as = _mm512_permute_pd(a, 0x55), cb = _mm512_castsi512_pd(_mm512_xor_si512( _mm512_castpd_si512(b), - _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( - _mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0))))), - br = _mm512_permute_pd(cb, 0x00), - bi = _mm512_permute_pd(cb, 0xff); + _mm512_castpd_si512(_mm512_mask_broadcast_f64x4(_mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0))))), + br = _mm512_permute_pd(cb, 0x00), bi = _mm512_permute_pd(cb, 0xff); #if defined(__FMA__) return _mm512_div_pd(_mm512_fmaddsub_pd(a, br, _mm512_mul_pd(as, bi)), _mm512_fmadd_pd(br, br, _mm512_mul_pd(bi, bi))); #else - return _mm512_div_pd( - _mm512_add_pd( - _mm512_mul_pd(a, br), - _mm512_castsi512_pd(_mm512_xor_si512( - _mm512_castpd_si512(_mm512_mul_pd(as, bi)), - _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( - _mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0)))))), - _mm512_add_pd(_mm512_mul_pd(br, br), _mm512_mul_pd(bi, bi))); + return _mm512_div_pd(_mm512_add_pd(_mm512_mul_pd(a, br), _mm512_castsi512_pd(_mm512_xor_si512( + _mm512_castpd_si512(_mm512_mul_pd(as, bi)), + _mm512_castpd_si512(_mm512_mask_broadcast_f64x4( + _mm512_setzero_pd(), 0xAA, _mm256_set1_pd(-0.0)))))), + _mm512_add_pd(_mm512_mul_pd(br, br), _mm512_mul_pd(bi, bi))); // const __mm512d cc = _mm512_mul_pd(as, bi); // return _mm512_div_pd(_mm512_mask_sub_pd(_mm512_mask_add_pd(_mm512_mul_pd(a, // br), 0x55, cc), 0xaa, cc), @@ -718,30 +660,24 @@ operator/(const Vector >, 4> &a, #if defined(__AVX__) || defined(__AVX2__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/(const Vector, 4> &a, + const Vector, 4> &b) { return _mm256_div_pd(a, b); } #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) -operator/(Vector >, 2> const &a, - Vector >, 2> const &b) { - const __m256d as = _mm256_permute_pd(a, 0x5), - cb = _mm256_xor_pd(b, _mm256_set_pd(-0.0, 0.0, -0.0, 0.0)), - br = _mm256_permute_pd(cb, 0x0), - bi = _mm256_permute_pd(cb, 0xf); +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 2) operator/( + Vector >, 2> const &a, Vector >, 2> const &b) { + const __m256d as = _mm256_permute_pd(a, 0x5), cb = _mm256_xor_pd(b, _mm256_set_pd(-0.0, 0.0, -0.0, 0.0)), + br = _mm256_permute_pd(cb, 0x0), bi = _mm256_permute_pd(cb, 0xf); #if defined(__FMA__) - return _mm256_div_pd( - _mm256_fmaddsub_pd(a, br, _mm256_mul_pd(as, bi)), - _mm256_add_pd(_mm256_mul_pd(br, br), _mm256_mul_pd(bi, bi))); + return _mm256_div_pd(_mm256_fmaddsub_pd(a, br, _mm256_mul_pd(as, bi)), + _mm256_add_pd(_mm256_mul_pd(br, br), _mm256_mul_pd(bi, bi))); #else return _mm256_div_pd( - _mm256_add_pd(_mm256_mul_pd(a, br), - _mm256_xor_pd(_mm256_mul_pd(as, bi), - _mm256_set_pd(0.0, -0.0, 0.0, -0.0))), + _mm256_add_pd(_mm256_mul_pd(a, br), _mm256_xor_pd(_mm256_mul_pd(as, bi), _mm256_set_pd(0.0, -0.0, 0.0, -0.0))), _mm256_add_pd(_mm256_mul_pd(br, br), _mm256_mul_pd(bi, bi))); #endif } @@ -751,8 +687,8 @@ operator/(Vector >, 2> const &a, #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator/(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator/(const Vector, l> &a, + const Vector, l> &b) { Vector, l> r_val; if (std::is_fundamental::value) { KOKKOSKERNELS_FORCE_SIMD @@ -765,24 +701,24 @@ operator/(const Vector, l> &a, const Vector, l> &b) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator/( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 2) operator/(const Vector, 2> &a, + const Vector, 2> &b) { float2 r_val; r_val.x = a.float2().x / b.float2().x; r_val.y = a.float2().y / b.float2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator/( - const Vector, 2> &a, const Vector, 2> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 2) operator/(const Vector, 2> &a, + const Vector, 2> &b) { double2 r_val; r_val.x = a.double2().x / b.double2().x; r_val.y = a.double2().y / b.double2().y; return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator/( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator/(const Vector, 4> &a, + const Vector, 4> &b) { float4 r_val; r_val.x = a.float4().x / b.float4().x; r_val.y = a.float4().y / b.float4().y; @@ -791,8 +727,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float, 4) operator/( return r_val; } KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/( - const Vector, 4> &a, const Vector, 4> &b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/(const Vector, 4> &a, + const Vector, 4> &b) { double4 r_val; r_val.x = a.double4().x / b.double4().x; r_val.y = a.double4().y / b.double4().y; @@ -803,9 +739,8 @@ static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(double, 4) operator/( #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator/=(Vector, l> &a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator/=( + Vector, l> &a, const Vector, l> &b) { a = a / b; return a; } @@ -816,8 +751,8 @@ operator/=(Vector, l> &a, const Vector, l> &b) { #if !defined(KOKKOS_COMPILER_GNU) KOKKOS_FORCEINLINE_FUNCTION -static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) -operator/(const Vector >, 4> &a, const double b) { +static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, 4) operator/( + const Vector >, 4> &a, const double b) { return _mm512_div_pd(a, _mm512_set1_pd(b)); } #endif @@ -826,21 +761,20 @@ operator/(const Vector >, 4> &a, const double b) { #endif template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator/(const Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator/(const Vector, l> &a, + const T b) { return a / Vector, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) -operator/(const T a, const Vector, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator/(const T a, + const Vector, l> &b) { return Vector, l>(a) / b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - T, l) -operator/=(Vector, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) operator/=( + Vector, l> &a, const T b) { a = a / b; return a; } @@ -848,23 +782,20 @@ operator/=(Vector, l> &a, const T b) { /// simd complex, real template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const Vector >, l> &a, const T b) { return a / Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const T a, const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const T a, const Vector >, l> &b) { return Vector >, l>(a) / b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator/=(Vector >, l> &a, const T b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator/=( + Vector >, l> &a, const T b) { a = a / b; return a; } @@ -872,26 +803,20 @@ operator/=(Vector >, l> &a, const T b) { /// simd complex, complex template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const Vector >, l> &a, const Kokkos::complex b) { return a / Vector >, l>(b); } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE( - Kokkos::complex, l) -operator/(const Kokkos::complex a, - const Vector >, l> &b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(Kokkos::complex, l) operator/( + const Kokkos::complex a, const Vector >, l> &b) { return Vector >, l>(a) / b; } template -KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( - Kokkos::complex, l) -operator/=(Vector >, l> &a, - const Kokkos::complex b) { +KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(Kokkos::complex, l) operator/=( + Vector >, l> &a, const Kokkos::complex b) { a = a / b; return a; } diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp index c8c07e97c4..f289d5be09 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp @@ -22,16 +22,13 @@ namespace KokkosBatched { -#define KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) \ - typename std::enable_if::value && \ - std::is_integral::value, \ +#define KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) \ + typename std::enable_if::value && std::is_integral::value, \ const Vector, l> >::type template -KOKKOS_INLINE_FUNCTION static - typename std::enable_if::value, - const Vector, l> >::type - operator!(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static typename std::enable_if::value, const Vector, l> >::type +operator!(const Vector, l> &a) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -44,9 +41,8 @@ KOKKOS_INLINE_FUNCTION static } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator||(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator||( + const Vector, l> &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -59,9 +55,8 @@ operator||(const Vector, l> &a, const Vector, l> &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator&&(const Vector, l> &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator&&( + const Vector, l> &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -74,9 +69,8 @@ operator&&(const Vector, l> &a, const Vector, l> &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator||(const Vector, l> &a, const T1 &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator||( + const Vector, l> &a, const T1 &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -89,9 +83,8 @@ operator||(const Vector, l> &a, const T1 &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator&&(const Vector, l> &a, const T1 &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator&&( + const Vector, l> &a, const T1 &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -104,9 +97,8 @@ operator&&(const Vector, l> &a, const T1 &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator||(const T0 &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator||( + const T0 &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -119,9 +111,8 @@ operator||(const T0 &a, const Vector, l> &b) { } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, - T1, l) -operator&&(const T0 &a, const Vector, l> &b) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_LOGICAL_RETURN_BOOL_TYPE(T0, T1, l) operator&&( + const T0 &a, const Vector, l> &b) { Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp index 69bbb53c6b..eefaf4ce0d 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp @@ -24,14 +24,12 @@ namespace KokkosBatched { #define KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) Vector, l> #define KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) \ - typename std::enable_if::value, \ - Vector, l> >::type + typename std::enable_if::value, Vector, l> >::type /// simd template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - sqrt(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) sqrt(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -46,8 +44,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - cbrt(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) cbrt(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -62,8 +59,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - log(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -78,8 +74,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - log10(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log10(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -94,8 +89,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) - exp(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) exp(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -138,8 +132,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T0, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - sin(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sin(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -154,8 +147,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - cos(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cos(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -170,8 +162,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - tan(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tan(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -186,8 +177,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - sinh(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sinh(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -202,8 +192,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - cosh(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cosh(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -218,8 +207,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - tanh(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tanh(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -234,8 +222,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - asin(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) asin(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -250,8 +237,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - acos(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) acos(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -266,8 +252,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) - atan(const Vector, l> &a) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) atan(const Vector, l> &a) { typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp index a95a752779..02f717d458 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp @@ -30,17 +30,13 @@ namespace KokkosBatched { // scalar, scalar template -KOKKOS_INLINE_FUNCTION static T conditional_assign(const bool cond, - const T &if_true_val, - const T &if_false_val) { +KOKKOS_INLINE_FUNCTION static T conditional_assign(const bool cond, const T &if_true_val, const T &if_false_val) { return cond ? if_true_val : if_false_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) - conditional_assign(/* */ T0 &r_val, const bool cond, const T1 &if_true_val, - const T2 &if_false_val) { +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ T0 &r_val, const bool cond, const T1 &if_true_val, const T2 &if_false_val) { r_val = cond ? if_true_val : if_false_val; } @@ -48,23 +44,18 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TY template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_RETURN_TYPE(T, l) - conditional_assign(const Vector, l> &cond, - const Vector, l> &if_true_val, + conditional_assign(const Vector, l> &cond, const Vector, l> &if_true_val, const T &if_false_val) { Vector, l> r_val; - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val[i] : if_false_val; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val[i] : if_false_val; return r_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) conditional_assign(/* */ Vector, l> &r_val, - const Vector, l> &cond, - const Vector, l> &if_true_val, - const T2 &if_false_val) { - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val[i] : if_false_val; +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ Vector, l> &r_val, const Vector, l> &cond, + const Vector, l> &if_true_val, const T2 &if_false_val) { + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val[i] : if_false_val; } // scalar, vector @@ -74,74 +65,57 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_RETURN_TYPE(T, l) conditional_assign(const Vector, l> &cond, const T &if_true_val, const Vector, l> &if_false_val) { Vector, l> r_val; - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val : if_false_val[i]; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val : if_false_val[i]; return r_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) - conditional_assign(/* */ Vector, l> &r_val, - const Vector, l> &cond, const T1 &if_true_val, +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ Vector, l> &r_val, const Vector, l> &cond, const T1 &if_true_val, const Vector, l> &if_false_val) { - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val : if_false_val[i]; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val : if_false_val[i]; } // vector, vector template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_RETURN_TYPE(T, l) - conditional_assign(const Vector, l> &cond, - const Vector, l> &if_true_val, + conditional_assign(const Vector, l> &cond, const Vector, l> &if_true_val, const Vector, l> &if_false_val) { Vector, l> r_val; - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val[i] : if_false_val[i]; + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val[i] : if_false_val[i]; return r_val; } template -KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE( - T0, T1, T2, l) conditional_assign(/* */ Vector, l> &r_val, - const Vector, l> &cond, - const Vector, l> &if_true_val, - const Vector, l> &if_false_val) { - for (int i = 0; i < l; ++i) - r_val[i] = cond[i] ? if_true_val[i] : if_false_val[i]; +KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MISC_CONVERTIBLE_RETURN_VOID_TYPE(T0, T1, T2, l) + conditional_assign(/* */ Vector, l> &r_val, const Vector, l> &cond, + const Vector, l> &if_true_val, const Vector, l> &if_false_val) { + for (int i = 0; i < l; ++i) r_val[i] = cond[i] ? if_true_val[i] : if_false_val[i]; } template -KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, - const BinaryOp &func) { +KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, const BinaryOp &func) { T r_val = val[0]; for (int i = 1; i < l; ++i) r_val = func(r_val, val[i]); return r_val; } template -KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, - const BinaryOp &func, const T init) { +KOKKOS_INLINE_FUNCTION static T reduce(const Vector, l> &val, const BinaryOp &func, const T init) { T r_val = init; for (int i = 0; i < l; ++i) r_val = func(r_val, val[i]); return r_val; } template -KOKKOS_INLINE_FUNCTION static bool is_all_true( - const Vector, l> &cond) { - return reduce(cond, [](const bool left, const bool right) -> bool { - return (left && right); - }); +KOKKOS_INLINE_FUNCTION static bool is_all_true(const Vector, l> &cond) { + return reduce(cond, [](const bool left, const bool right) -> bool { return (left && right); }); } template -KOKKOS_INLINE_FUNCTION static bool is_any_true( - const Vector, l> &cond) { - return reduce(cond, [](const bool left, const bool right) -> bool { - return left || right; - }); +KOKKOS_INLINE_FUNCTION static bool is_any_true(const Vector, l> &cond) { + return reduce(cond, [](const bool left, const bool right) -> bool { return left || right; }); } template diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp index d49c6f35f9..c956780192 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp @@ -25,13 +25,13 @@ namespace KokkosBatched { // vector, vector #undef KOKKOSBATCHED_RELATION_OPERATOR -#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ - template \ - KOKKOS_INLINE_FUNCTION const Vector, l> operator op( \ - const Vector, l> &a, const Vector, l> &b) { \ - Vector, l> r_val; \ - for (int i = 0; i < l; ++i) r_val[i] = a[i] op b[i]; \ - return r_val; \ +#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ + template \ + KOKKOS_INLINE_FUNCTION const Vector, l> operator op(const Vector, l> &a, \ + const Vector, l> &b) { \ + Vector, l> r_val; \ + for (int i = 0; i < l; ++i) r_val[i] = a[i] op b[i]; \ + return r_val; \ } KOKKOSBATCHED_RELATION_OPERATOR(<) @@ -43,13 +43,12 @@ KOKKOSBATCHED_RELATION_OPERATOR(!=) // vector, scalar #undef KOKKOSBATCHED_RELATION_OPERATOR -#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ - template \ - KOKKOS_INLINE_FUNCTION const Vector, l> operator op( \ - const Vector, l> &a, const T2 &b) { \ - Vector, l> r_val; \ - for (int i = 0; i < l; ++i) r_val[i] = a[i] op b; \ - return r_val; \ +#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ + template \ + KOKKOS_INLINE_FUNCTION const Vector, l> operator op(const Vector, l> &a, const T2 &b) { \ + Vector, l> r_val; \ + for (int i = 0; i < l; ++i) r_val[i] = a[i] op b; \ + return r_val; \ } KOKKOSBATCHED_RELATION_OPERATOR(<) @@ -61,13 +60,12 @@ KOKKOSBATCHED_RELATION_OPERATOR(!=) // scalar, vector #undef KOKKOSBATCHED_RELATION_OPERATOR -#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ - template \ - KOKKOS_INLINE_FUNCTION const Vector, l> operator op( \ - const T1 &a, const Vector, l> &b) { \ - Vector, l> r_val; \ - for (int i = 0; i < l; ++i) r_val[i] = a op b[i]; \ - return r_val; \ +#define KOKKOSBATCHED_RELATION_OPERATOR(op) \ + template \ + KOKKOS_INLINE_FUNCTION const Vector, l> operator op(const T1 &a, const Vector, l> &b) { \ + Vector, l> r_val; \ + for (int i = 0; i < l; ++i) r_val[i] = a op b[i]; \ + return r_val; \ } KOKKOSBATCHED_RELATION_OPERATOR(<) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp index 3fb7ac872b..60e5e43e57 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp @@ -63,52 +63,38 @@ struct SimdViewAccess { } template - KOKKOS_INLINE_FUNCTION constexpr - typename std::enable_if::value, size_t>::type - extent(const iType &r) const { + KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if::value, size_t>::type extent( + const iType &r) const { return _a.extent(r) * (r == PackDim::value ? vector_length : 1); } template - KOKKOS_INLINE_FUNCTION constexpr - typename std::enable_if::value, int>::type - extent_int(const iType &r) const { - return static_cast(_a.extent(r) * - (r == PackDim::value ? vector_length : 1)); + KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if::value, int>::type extent_int( + const iType &r) const { + return static_cast(_a.extent(r) * (r == PackDim::value ? vector_length : 1)); } - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return (_a.size() * vector_length); - } + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return (_a.size() * vector_length); } - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { - return _a.span() * vector_length; - } - KOKKOS_INLINE_FUNCTION constexpr bool span_span_is_contiguous() const { - return _a.span_span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return _a.data(); - } + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return _a.span() * vector_length; } + KOKKOS_INLINE_FUNCTION constexpr bool span_span_is_contiguous() const { return _a.span_span_is_contiguous(); } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return _a.data(); } /// rank 0 /// this does not make sense as this is flat view to simd view /// rank 1 template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && 1 == ViewType::rank, - reference_type> - operator()(const I0 &i0, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && 1 == ViewType::rank, reference_type> + operator()(const I0 &i0, Args... /*args*/) const { return _a(i0 / vector_length)[i0 % vector_length]; } /// rank 2 template KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t && - 2 == ViewType::rank, - reference_type> + std::enable_if_t && 2 == ViewType::rank, reference_type> operator()(const I0 &i0, const I1 &i1, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1)[i0 % vector_length]; @@ -120,11 +106,9 @@ struct SimdViewAccess { /// rank 3 template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 3 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && 3 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2)[i0 % vector_length]; case 1: return _a(i0, i1 / vector_length, i2)[i1 % vector_length]; @@ -137,11 +121,8 @@ struct SimdViewAccess { /// rank 4 template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 4 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - Args... /*args*/) const { + KokkosKernels::Impl::are_integral_v && 4 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3)[i0 % vector_length]; case 1: return _a(i0, i1 / vector_length, i2, i3)[i1 % vector_length]; @@ -153,14 +134,10 @@ struct SimdViewAccess { } /// rank 5 - template + template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 5 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, Args... /*args*/) const { + KokkosKernels::Impl::are_integral_v && 5 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3, i4)[i0 % vector_length]; case 1: return _a(i0, i1 / vector_length, i2, i3, i4)[i1 % vector_length]; @@ -173,25 +150,17 @@ struct SimdViewAccess { } /// rank 6 - template + template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - KokkosKernels::Impl::are_integral_v && - 6 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, Args... /*args*/) const { + KokkosKernels::Impl::are_integral_v && 6 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, + Args... /*args*/) const { switch (PackDim::value) { - case 0: - return _a(i0 / vector_length, i1, i2, i3, i4, i5)[i0 % vector_length]; - case 1: - return _a(i0, i1 / vector_length, i2, i3, i4, i5)[i1 % vector_length]; - case 2: - return _a(i0, i1, i2 / vector_length, i3, i4, i5)[i2 % vector_length]; - case 3: - return _a(i0, i1, i2, i3 / vector_length, i4, i5)[i3 % vector_length]; - case 4: - return _a(i0, i1, i2, i3, i4 / vector_length, i5)[i4 % vector_length]; + case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5)[i0 % vector_length]; + case 1: return _a(i0, i1 / vector_length, i2, i3, i4, i5)[i1 % vector_length]; + case 2: return _a(i0, i1, i2 / vector_length, i3, i4, i5)[i2 % vector_length]; + case 3: return _a(i0, i1, i2, i3 / vector_length, i4, i5)[i3 % vector_length]; + case 4: return _a(i0, i1, i2, i3, i4 / vector_length, i5)[i4 % vector_length]; case 5: break; default: break; } @@ -199,35 +168,18 @@ struct SimdViewAccess { } /// rank 7 - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t && - 7 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, - Args... /*args*/) const { + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 7 == ViewType::rank, reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, const I6 &i6, + Args... /*args*/) const { switch (PackDim::value) { - case 0: - return _a(i0 / vector_length, i1, i2, i3, i4, i5, - i6)[i0 % vector_length]; - case 1: - return _a(i0, i1 / vector_length, i2, i3, i4, i5, - i6)[i1 % vector_length]; - case 2: - return _a(i0, i1, i2 / vector_length, i3, i4, i5, - i6)[i2 % vector_length]; - case 3: - return _a(i0, i1, i2, i3 / vector_length, i4, i5, - i6)[i3 % vector_length]; - case 4: - return _a(i0, i1, i2, i3, i4 / vector_length, i5, - i6)[i4 % vector_length]; - case 5: - return _a(i0, i1, i2, i3, i4, i5 / vector_length, - i6)[i5 % vector_length]; + case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6)[i0 % vector_length]; + case 1: return _a(i0, i1 / vector_length, i2, i3, i4, i5, i6)[i1 % vector_length]; + case 2: return _a(i0, i1, i2 / vector_length, i3, i4, i5, i6)[i2 % vector_length]; + case 3: return _a(i0, i1, i2, i3 / vector_length, i4, i5, i6)[i3 % vector_length]; + case 4: return _a(i0, i1, i2, i3, i4 / vector_length, i5, i6)[i4 % vector_length]; + case 5: return _a(i0, i1, i2, i3, i4, i5 / vector_length, i6)[i5 % vector_length]; case 6: break; default: break; } @@ -235,43 +187,25 @@ struct SimdViewAccess { } /// rank 8 - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t && - 8 == ViewType::rank, - reference_type> - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7, - Args... /*args*/) const { + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 8 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, const I6 &i6, + const I7 &i7, Args... /*args*/) const { switch (PackDim::value) { - case 0: - return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6, - i7)[i0 % vector_length]; - case 1: - return _a(i0, i1 / vector_length, i2, i3, i4, i5, i6, - i7)[i1 % vector_length]; - case 2: - return _a(i0, i1, i2 / vector_length, i3, i4, i5, i6, - i7)[i2 % vector_length]; - case 3: - return _a(i0, i1, i2, i3 / vector_length, i4, i5, i6, - i7)[i3 % vector_length]; - case 4: - return _a(i0, i1, i2, i3, i4 / vector_length, i5, i6, - i7)[i4 % vector_length]; - case 5: - return _a(i0, i1, i2, i3, i4, i5 / vector_length, i6, - i7)[i5 % vector_length]; - case 6: - return _a(i0, i1, i2, i3, i4, i5, i6 / vector_length, - i7)[i6 % vector_length]; + case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6, i7)[i0 % vector_length]; + case 1: return _a(i0, i1 / vector_length, i2, i3, i4, i5, i6, i7)[i1 % vector_length]; + case 2: return _a(i0, i1, i2 / vector_length, i3, i4, i5, i6, i7)[i2 % vector_length]; + case 3: return _a(i0, i1, i2, i3 / vector_length, i4, i5, i6, i7)[i3 % vector_length]; + case 4: return _a(i0, i1, i2, i3, i4 / vector_length, i5, i6, i7)[i4 % vector_length]; + case 5: return _a(i0, i1, i2, i3, i4, i5 / vector_length, i6, i7)[i5 % vector_length]; + case 6: return _a(i0, i1, i2, i3, i4, i5, i6 / vector_length, i7)[i6 % vector_length]; case 7: break; default: break; } - return _a(i0, i1, i2, i3, i4, i5, i6, - i7 / vector_length)[i7 % vector_length]; + return _a(i0, i1, i2, i3, i4, i5, i6, i7 / vector_length)[i7 % vector_length]; } }; } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp index 0d3a9b3df9..a23a9ea4d0 100644 --- a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp @@ -29,12 +29,10 @@ namespace KokkosBatched { /// struct SerialWilkinsonShiftInternal { template - KOKKOS_INLINE_FUNCTION static int invoke( - const ValueType a, const ValueType b, const ValueType c, - const ValueType d, - /* */ Kokkos::complex* lambda1, - /* */ Kokkos::complex* lambda2, - /* */ bool* is_complex) { + KOKKOS_INLINE_FUNCTION static int invoke(const ValueType a, const ValueType b, const ValueType c, const ValueType d, + /* */ Kokkos::complex* lambda1, + /* */ Kokkos::complex* lambda2, + /* */ bool* is_complex) { /// compute eigenvalues of 2x2 system [a b; /// c d] /// when the system has a real complex values, diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index 52e1425041..988bd30c93 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -27,11 +27,9 @@ namespace KokkosBatched { /// ==================== struct SerialXpayInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -44,10 +42,9 @@ struct SerialXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -60,17 +57,14 @@ struct SerialXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, - const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, - const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType* KOKKOS_RESTRICT alpha, + const int alphas0, const ValueType* KOKKOS_RESTRICT X, const int xs0, + const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (xs0 > xs1) - for (int i = 0; i < m; ++i) - invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + for (int i = 0; i < m; ++i) invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); else - for (int j = 0; j < n; ++j) - invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + for (int j = 0; j < n; ++j) invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); return 0; } @@ -81,12 +75,9 @@ struct SerialXpayInternal { /// ==================== struct TeamXpayInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha; Y[i * ys0] += X[i * xs0]; @@ -96,11 +87,10 @@ struct TeamXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha[i * alphas0]; Y[i * ys0] += X[i * xs0]; @@ -110,23 +100,18 @@ struct TeamXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int& i) { - SerialXpayInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, - Y + i * ys0, ys1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int& i) { + SerialXpayInternal::invoke(n, alpha[i * alphas0], X + i * xs0, xs1, Y + i * ys0, ys1); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int& j) { - SerialXpayInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, - Y + j * ys1, ys0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int& j) { + SerialXpayInternal::invoke(m, alpha, alphas0, X + j * xs1, xs0, Y + j * ys1, ys0); + }); } // member.team_barrier(); return 0; @@ -138,12 +123,9 @@ struct TeamXpayInternal { /// ======================== struct TeamVectorXpayInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const int m, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT X, - const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, - const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha; Y[i * ys0] += X[i * xs0]; @@ -153,11 +135,10 @@ struct TeamVectorXpayInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) { Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { Y[i * ys0] *= alpha[i * alphas0]; Y[i * ys0] += X[i * xs0]; @@ -166,20 +147,17 @@ struct TeamVectorXpayInternal { return 0; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const int m, const int n, - const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, - const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, - /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), - [&](const int& iTemp) { - int i, j; - getIndices(iTemp, n, m, j, i); - Y[i * ys0 + j * ys1] *= alpha[i * alphas0]; - Y[i * ys0 + j * ys1] += X[i * xs0 + j * xs1]; - }); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const int n, + const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0, + const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1, + /* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) { + int i, j; + getIndices(iTemp, n, m, j, i); + Y[i * ys0 + j * ys1] *= alpha[i * alphas0]; + Y[i * ys0 + j * ys1] += X[i * xs0 + j * xs1]; + }); // member.team_barrier(); return 0; } @@ -189,18 +167,12 @@ struct TeamVectorXpayInternal { /// Serial Impl /// =========== template -KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, - const ViewType& X, - const ViewType& Y) { +KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, const ViewType& X, const ViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: ViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::rank == 2, - "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::xpay: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -219,11 +191,10 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, } #endif - return SerialXpayInternal::template invoke< - typename alphaViewType::non_const_value_type, - typename ViewType::non_const_value_type>( - X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), - X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), Y.stride_1()); + return SerialXpayInternal::template invoke( + X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -232,18 +203,13 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, template template -KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( - const MemberType& member, const alphaViewType& alpha, const ViewType& X, - const ViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamXpay::invoke(const MemberType& member, const alphaViewType& alpha, + const ViewType& X, const ViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: ViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::rank == 2, - "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::xpay: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -262,12 +228,10 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( } #endif - return TeamXpayInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ViewType::non_const_value_type>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), - Y.stride_1()); + return TeamXpayInternal::template invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } /// @@ -276,18 +240,13 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( template template -KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( - const MemberType& member, const alphaViewType& alpha, const ViewType& X, - const ViewType& Y) { +KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke(const MemberType& member, const alphaViewType& alpha, + const ViewType& X, const ViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: ViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::rank == 2, - "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::xpay: alphaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { @@ -306,12 +265,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( } #endif - return TeamVectorXpayInternal::invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ViewType::non_const_value_type, typename ViewType::array_layout>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - X.data(), X.stride_0(), X.stride_1(), Y.data(), Y.stride_0(), - Y.stride_1()); + return TeamVectorXpayInternal::invoke( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), X.data(), X.stride_0(), X.stride_1(), Y.data(), + Y.stride_0(), Y.stride_1()); } } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp b/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp index 6b75a11dc7..7eadc43269 100644 --- a/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp +++ b/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp @@ -34,8 +34,7 @@ namespace KokkosBatched { struct SerialAddRadial { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType tiny, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType tiny, const AViewType &A); }; /// @@ -45,9 +44,7 @@ struct SerialAddRadial { template struct TeamAddRadial { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType tiny, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType tiny, const AViewType &A); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp index 3fe51f3138..bee7d3a645 100644 --- a/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp @@ -29,21 +29,16 @@ namespace KokkosBatched { // level 1 operation template struct SerialApplyHouseholder { - template - KOKKOS_INLINE_FUNCTION static int invoke(const uViewType &u2, - const tauViewType &tau, + template + KOKKOS_INLINE_FUNCTION static int invoke(const uViewType &u2, const tauViewType &tau, const AViewType const wViewType &w); }; // level 1 operation template struct TeamVectorApplyHouseholder { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const uViewType &u2, - const tauViewType &tau, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const uViewType &u2, const tauViewType &tau, const AViewType const wViewType &w); }; diff --git a/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp index fb9bef60ae..2aa00bf8c2 100644 --- a/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp @@ -28,13 +28,10 @@ namespace KokkosBatched { template struct TeamVectorApplyPivot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int piv, const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int piv, const AViewType &A); template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const PivViewType piv, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const PivViewType piv, const AViewType &A); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp index 177c338a98..7f78e31700 100644 --- a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp @@ -28,11 +28,8 @@ namespace KokkosBatched { template struct SerialApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const tViewType &t, - const BViewType &B, + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w); }; @@ -40,56 +37,39 @@ struct SerialApplyQ { /// Team ApplyQ /// -template +template struct TeamApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w); }; /// /// TeamVector ApplyQ /// -template +template struct TeamVectorApplyQ { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w); }; /// /// Selective Interface /// -template +template struct ApplyQ { - template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const BViewType &B, - const wViewType &w) { + template + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const BViewType &B, const wViewType &w) { int r_val = 0; if (std::is_same::value) { r_val = SerialApplyQ::invoke(A, t, B, w); } else if (std::is_same::value) { - r_val = TeamApplyQ::invoke( - member, A, t, B, w); + r_val = TeamApplyQ::invoke(member, A, t, B, w); } else if (std::is_same::value) { - r_val = TeamVectorApplyQ::invoke( - member, A, t, B, w); + r_val = TeamVectorApplyQ::invoke(member, A, t, B, w); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_Axpy.hpp b/batched/dense/src/KokkosBatched_Axpy.hpp index b76772f3b2..5b89c0862e 100644 --- a/batched/dense/src/KokkosBatched_Axpy.hpp +++ b/batched/dense/src/KokkosBatched_Axpy.hpp @@ -44,9 +44,7 @@ namespace KokkosBatched { struct SerialAxpy { template - KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, - const XViewType &X, - const YViewType &Y); + KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, const XViewType &X, const YViewType &Y); }; /// \brief Team Batched AXPY: @@ -72,9 +70,7 @@ struct SerialAxpy { template struct TeamAxpy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const XViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const XViewType &X, const YViewType &Y); }; @@ -102,9 +98,7 @@ struct TeamAxpy { template struct TeamVectorAxpy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const XViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const XViewType &X, const YViewType &Y); }; diff --git a/batched/dense/src/KokkosBatched_Copy_Decl.hpp b/batched/dense/src/KokkosBatched_Copy_Decl.hpp index 07e6ea42da..0e2b24e91d 100644 --- a/batched/dense/src/KokkosBatched_Copy_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Copy_Decl.hpp @@ -29,46 +29,36 @@ namespace KokkosBatched { template struct SerialCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const BViewType &B); }; /// /// Team Copy /// -template +template struct TeamCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B); }; /// /// TeamVector Copy /// -template +template struct TeamVectorCopy { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B); }; /// /// Selective Interface /// -template +template struct Copy { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { int r_val = 0; if (std::is_same::value) { r_val = SerialCopy::invoke(A, B); @@ -85,29 +75,23 @@ struct Copy { #include "KokkosBatched_Copy_Impl.hpp" -#define KOKKOSBATCHED_SERIAL_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE( \ - M, N, A, AS0, AS1, B, BS0, BS1) \ +#define KOKKOSBATCHED_SERIAL_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(M, N, A, AS0, AS1, B, BS0, BS1) \ KokkosBatched::SerialCopyInternal ::invoke(M, N, A, AS0, AS1, B, BS0, BS1) -#define KOKKOSBATCHED_TEAM_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MEMBER, M, N, A, AS0, AS1, B, BS0, BS1) \ - KokkosBatched::TeamCopyInternal ::invoke(MEMBER, M, N, A, AS0, AS1, B, BS0, \ - BS1) +#define KOKKOSBATCHED_TEAM_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, N, A, AS0, AS1, B, BS0, BS1) \ + KokkosBatched::TeamCopyInternal ::invoke(MEMBER, M, N, A, AS0, AS1, B, BS0, BS1) #define KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M, A, AS, B, BS) \ KokkosBatched::SerialCopyInternal ::invoke(M, A, AS, B, BS) -#define KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MEMBER, M, A, AS, B, BS) \ +#define KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, A, AS, B, BS) \ KokkosBatched::TeamCopyInternal ::invoke(MEMBER, M, A, AS, B, BS) -#define KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, MEMBER, M, A, AS, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M, A, AS, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, A, \ - AS, B, BS); \ +#define KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, MEMBER, M, A, AS, B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M, A, AS, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER, M, A, AS, B, BS); \ } #endif diff --git a/batched/dense/src/KokkosBatched_Dot.hpp b/batched/dense/src/KokkosBatched_Dot.hpp index c04914e220..545a4954ce 100644 --- a/batched/dense/src/KokkosBatched_Dot.hpp +++ b/batched/dense/src/KokkosBatched_Dot.hpp @@ -52,9 +52,7 @@ namespace KokkosBatched { template struct SerialDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const NormViewType &dot); + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const NormViewType &dot); }; /// \brief Team Batched DOT: @@ -86,9 +84,7 @@ struct SerialDot { template struct TeamDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot); }; @@ -122,9 +118,7 @@ struct TeamDot { template struct TeamVectorDot { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const NormViewType &dot); }; diff --git a/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp b/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp index 4ba24d519b..39ead9e26c 100644 --- a/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp @@ -49,21 +49,17 @@ namespace KokkosBatched { /// dimension of matrix A. struct SerialEigendecomposition { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const AViewType &A, const EViewType &er, const EViewType &ei, - const UViewType &UL, const UViewType &UR, const WViewType &W); + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const EViewType &er, const EViewType &ei, + const UViewType &UL, const UViewType &UR, const WViewType &W); }; template struct TeamVectorEigendecomposition { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const AViewType &A, const EViewType &er, - const EViewType &ei, const UViewType &UL, const UViewType &UR, - const WViewType &W); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const EViewType &er, + const EViewType &ei, const UViewType &UL, const UViewType &UR, + const WViewType &W); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp index 1febcaa771..9f4b745561 100644 --- a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp @@ -25,61 +25,46 @@ namespace KokkosBatched { template struct SerialGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B, - const ScalarType beta, - const CViewType &C); + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B, + const ScalarType beta, const CViewType &C); }; /// /// Team Gemm /// -template +template struct TeamGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C); }; /// /// TeamVector Gemm /// -template +template struct TeamVectorGemm { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C); }; /// /// Selective Interface /// -template +template struct Gemm { - template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, const CViewType &C) { + template + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { int r_val = 0; if (std::is_same::value) { - r_val = SerialGemm::invoke(alpha, A, B, - beta, C); + r_val = SerialGemm::invoke(alpha, A, B, beta, C); } else if (std::is_same::value) { - r_val = TeamGemm::invoke( - member, alpha, A, B, beta, C); + r_val = TeamGemm::invoke(member, alpha, A, B, beta, C); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_Gemv_Decl.hpp b/batched/dense/src/KokkosBatched_Gemv_Decl.hpp index 825efa9dc5..9ab86d9e07 100644 --- a/batched/dense/src/KokkosBatched_Gemv_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemv_Decl.hpp @@ -29,13 +29,9 @@ namespace KokkosBatched { template struct SerialGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, - const yViewType & /*y*/) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, const AViewType & /*A*/, const xViewType & /*x*/, + const ScalarType /*beta*/, const yViewType & /*y*/) { Kokkos::abort( "Error: KokkosBatched::SerialGemv has been deprecated - use " "KokkosBlas::SerialGemv instead"); @@ -49,13 +45,9 @@ struct SerialGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { assert(false && "Error: encounter dummy impl"); return 0; @@ -68,13 +60,9 @@ struct TeamGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { assert(false && "Error: encounter dummy impl"); return 0; @@ -84,23 +72,18 @@ struct TeamVectorGemv { /// /// Selective Interface /// -template +template struct Gemv { - template - KOKKOS_FORCEINLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { + template + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, + const xViewType &x, const ScalarType beta, const yViewType &y) { int r_val = 0; if (std::is_same::value) { r_val = SerialGemv::invoke(alpha, A, x, beta, y); } else if (std::is_same::value) { - r_val = TeamGemv::invoke(member, alpha, A, - x, beta, y); + r_val = TeamGemv::invoke(member, alpha, A, x, beta, y); } else if (std::is_same::value) { - r_val = TeamVectorGemv::invoke( - member, alpha, A, x, beta, y); + r_val = TeamVectorGemv::invoke(member, alpha, A, x, beta, y); } return r_val; } @@ -112,44 +95,35 @@ struct Gemv { #include "KokkosBatched_Gemv_TeamVector_Impl.hpp" #include "KokkosBlas2_serial_gemv_internal.hpp" -#define KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::SerialGemvInternal::invoke( \ - M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::SerialGemvInternal::invoke( \ - N, M, ALPHA, A, AS1, AS0, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::TeamGemvInternal::invoke( \ - MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - KokkosBlas::Impl::TeamGemvInternal::invoke( \ - MEMBER, N, M, ALPHA, A, AS1, AS0, X, XS, BETA, Y, YS) - -#define KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ +#define KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ + KokkosBlas::Impl::SerialGemvInternal::invoke(M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ + KokkosBlas::Impl::SerialGemvInternal::invoke(N, M, ALPHA, A, AS1, AS0, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, \ + Y, YS) \ + KokkosBlas::Impl::TeamGemvInternal::invoke(MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, \ + YS) \ + KokkosBlas::Impl::TeamGemvInternal::invoke(MEMBER, N, M, ALPHA, A, AS1, AS0, X, XS, BETA, Y, YS) + +#define KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, \ + BETA, Y, YS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, \ + YS); \ } -#define KOKKOSBATCHED_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ +#define KOKKOSBATCHED_GEMV_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, \ + BETA, Y, YS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_GEMV_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, M, N, ALPHA, A, AS0, AS1, X, XS, BETA, Y, YS); \ } #endif diff --git a/batched/dense/src/KokkosBatched_Gesv.hpp b/batched/dense/src/KokkosBatched_Gesv.hpp index c4821db459..77922e4da0 100644 --- a/batched/dense/src/KokkosBatched_Gesv.hpp +++ b/batched/dense/src/KokkosBatched_Gesv.hpp @@ -64,15 +64,12 @@ struct Gesv { template struct SerialGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const XVectorType X, - const YVectorType Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const XVectorType X, const YVectorType Y, const MatrixType tmp); template - [[deprecated]] KOKKOS_INLINE_FUNCTION static int invoke( - const MatrixType A, const VectorType X, const VectorType Y, - const MatrixType tmp) { + [[deprecated]] KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, const VectorType X, const VectorType Y, + const MatrixType tmp) { return invoke(A, X, Y, tmp); } }; @@ -109,9 +106,7 @@ struct SerialGesv { template struct TeamGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y); }; @@ -148,9 +143,7 @@ struct TeamGesv { template struct TeamVectorGesv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const MatrixType A, - const VectorType X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const MatrixType A, const VectorType X, const VectorType Y); }; diff --git a/batched/dense/src/KokkosBatched_HadamardProduct.hpp b/batched/dense/src/KokkosBatched_HadamardProduct.hpp index fadd4b5774..f21aa8bae2 100644 --- a/batched/dense/src/KokkosBatched_HadamardProduct.hpp +++ b/batched/dense/src/KokkosBatched_HadamardProduct.hpp @@ -42,9 +42,7 @@ namespace KokkosBatched { struct SerialHadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, - const YViewType &Y, - const VViewType &V); + KOKKOS_INLINE_FUNCTION static int invoke(const XViewType &X, const YViewType &Y, const VViewType &V); }; /// \brief Team Batched Hadamard Product: @@ -68,9 +66,7 @@ struct SerialHadamardProduct { template struct TeamHadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const VViewType &V); }; @@ -96,31 +92,22 @@ struct TeamHadamardProduct { template struct TeamVectorHadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const VViewType &V); }; template struct HadamardProduct { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const XViewType &X, - const YViewType &Y, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const XViewType &X, const YViewType &Y, const VViewType &V) { int r_val = 0; if (std::is_same::value) { - r_val = SerialHadamardProduct::template invoke(X, Y, V); + r_val = SerialHadamardProduct::template invoke(X, Y, V); } else if (std::is_same::value) { - r_val = - TeamHadamardProduct::template invoke(member, X, - Y, V); + r_val = TeamHadamardProduct::template invoke(member, X, Y, V); } else if (std::is_same::value) { - r_val = TeamVectorHadamardProduct::template invoke< - XViewType, YViewType, VViewType>(member, X, Y, V); + r_val = TeamVectorHadamardProduct::template invoke(member, X, Y, V); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 4725e0220d..0741b5b41e 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -82,32 +82,23 @@ namespace KokkosBatched { /// BatchedGemm(handle, alpha, A, B, beta, C); // clang-format on -template -inline int BatchedGemm(BatchedGemmHandleType *const handle, - const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, - const CViewType &C) { +template +inline int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, const CViewType &C) { // Minimize the number of ImplBatchedGemmWrapper instantiations, by // standardizing on particular View specializations for its template // parameters. - using UnifiedAVT = Kokkos::View< - typename AViewType::value_type ***, typename AViewType::array_layout, - typename AViewType::device_type, Kokkos::MemoryTraits>; - using UnifiedBVT = Kokkos::View< - typename BViewType::value_type ***, typename BViewType::array_layout, - typename BViewType::device_type, Kokkos::MemoryTraits>; - using UnifiedCVT = Kokkos::View>; + using UnifiedAVT = Kokkos::View>; + using UnifiedBVT = Kokkos::View>; + using UnifiedCVT = Kokkos::View>; // Go through specialization layer in case ETI'd symbols are available. - return Impl::BatchedGemmSpec::run(handle, alpha, A, B, - beta, C); + return Impl::BatchedGemmSpec::run(handle, alpha, A, B, beta, C); } } // namespace KokkosBatched #endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp index 95e8f36bc2..2aa6f47cb0 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp @@ -40,15 +40,11 @@ enum GEMM_KOKKOS_BATCHED_ALGOS : int { }; } -#define GEMM_ALGO_STRS \ - "GemmTplAlgos::CUBLAS", "GemmTplAlgos::MAGMA", \ - "GemmKokkosBatchedAlgos::KK_TEAM", \ - "GemmKokkosBatchedAlgos::KK_TEAMVECTOR", \ - "GemmKokkosBatchedAlgos::KK_SERIALSIMD", \ - "GemmKokkosBatchedAlgos::KK_TEAMSIMD", \ - "GemmKokkosBatchedAlgos::KK_SERIAL_RANK0", \ - "GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM", \ - "GemmKokkosBatchedAlgos::KK_DBLBUF" +#define GEMM_ALGO_STRS \ + "GemmTplAlgos::CUBLAS", "GemmTplAlgos::MAGMA", "GemmKokkosBatchedAlgos::KK_TEAM", \ + "GemmKokkosBatchedAlgos::KK_TEAMVECTOR", "GemmKokkosBatchedAlgos::KK_SERIALSIMD", \ + "GemmKokkosBatchedAlgos::KK_TEAMSIMD", "GemmKokkosBatchedAlgos::KK_SERIAL_RANK0", \ + "GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM", "GemmKokkosBatchedAlgos::KK_DBLBUF" // clang-format off /// \brief Handle for selecting runtime behavior of the BatchedGemm interface. /// @@ -96,8 +92,7 @@ enum GEMM_KOKKOS_BATCHED_ALGOS : int { // clang-format on class BatchedGemmHandle : public BatchedKernelHandle { public: - BatchedGemmHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedGemmHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, int vecLength = 0) : BatchedKernelHandle(kernelAlgoType, teamSize, vecLength) { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) if (!_tplParamsSet && kernelAlgoType == GemmTplAlgos::CUBLAS) { @@ -116,26 +111,23 @@ class BatchedGemmHandle : public BatchedKernelHandle { #endif // MAGMA }; - BatchedGemmHandle(bool tplParamsSet, - int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedGemmHandle(bool tplParamsSet, int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, + int vecLength = 0) : BatchedKernelHandle(kernelAlgoType, teamSize, vecLength) { _tplParamsSet = tplParamsSet; }; #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) - BatchedGemmHandle(cublasHandle_t &cublas_handle, - int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedGemmHandle(cublasHandle_t &cublas_handle, int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, + int vecLength = 0) : BatchedGemmHandle(true, kernelAlgoType, teamSize, vecLength) { _tplParamsSingleton.cublas_handle = &cublas_handle; }; #endif // CUBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) - BatchedGemmHandle(magma_queue_t &magma_queue, - int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedGemmHandle(magma_queue_t &magma_queue, int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, + int vecLength = 0) : BatchedGemmHandle(true, kernelAlgoType, teamSize, vecLength) { _tplParamsSingleton.magma_queue = &magma_queue; }; @@ -151,13 +143,10 @@ class BatchedGemmHandle : public BatchedKernelHandle { #endif } - std::string get_kernel_algo_type_str() const { - return gemm_algo_type_strs[_kernelAlgoType]; - } + std::string get_kernel_algo_type_str() const { return gemm_algo_type_strs[_kernelAlgoType]; } private: - const char *gemm_algo_type_strs[GemmKokkosBatchedAlgos::N] = {BASE_ALGO_STRS, - GEMM_ALGO_STRS}; + const char *gemm_algo_type_strs[GemmKokkosBatchedAlgos::N] = {BASE_ALGO_STRS, GEMM_ALGO_STRS}; }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Householder_Decl.hpp b/batched/dense/src/KokkosBatched_Householder_Decl.hpp index 6d749bd73a..0a48457551 100644 --- a/batched/dense/src/KokkosBatched_Householder_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Householder_Decl.hpp @@ -30,8 +30,7 @@ namespace KokkosBatched { template struct SerialHouseholder { template - KOKKOS_INLINE_FUNCTION static int invoke(const aViewType &a, - const tauViewType &tau); + KOKKOS_INLINE_FUNCTION static int invoke(const aViewType &a, const tauViewType &tau); }; /// @@ -42,9 +41,7 @@ struct SerialHouseholder { template struct TeamVectorHouseholder { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const aViewType &a, - const tauViewType &tau); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const aViewType &a, const tauViewType &tau); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp index 90f2cdb643..757a92ca21 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp @@ -25,25 +25,19 @@ struct InnerGemmFixA { const int _as0, _as1, _bs0, _bs1, _cs0, _cs1; KOKKOS_INLINE_FUNCTION - InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, - const int cs0, const int cs1) + InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, const int cs0, const int cs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1), _cs0(cs0), _cs1(cs1) {} // serial rank update template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp index 67d968a356..b2f885970f 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp @@ -25,25 +25,19 @@ struct InnerGemmFixB { const int _as0, _as1, _bs0, _bs1, _cs0, _cs1; KOKKOS_INLINE_FUNCTION - InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, - const int cs0, const int cs1) + InnerGemmFixA(const int as0, const int as1, const int bs0, const int bs1, const int cs0, const int cs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1), _cs0(cs0), _cs1(cs1) {} // serial rank update template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int n, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp index 64d00845ee..c61d966f77 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp @@ -25,49 +25,37 @@ struct InnerGemmFixC { const int _as0, _as1, _bs0, _bs1, _cs0, _cs1; KOKKOS_INLINE_FUNCTION - InnerGemmFixC(const int as0, const int as1, const int bs0, const int bs1, - const int cs0, const int cs1) + InnerGemmFixC(const int as0, const int as1, const int bs0, const int bs1, const int cs0, const int cs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1), _cs0(cs0), _cs1(cs1) {} // serial rank update template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int k, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int k, /**/ ValueType *KOKKOS_RESTRICT C); // serial rank update for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, - const int m, const int n, - const int k, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, + const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); template - KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, + KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, const int k, /**/ ValueType *KOKKOS_RESTRICT C); // team rank update for remainder template - KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, - const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, - const ValueType *KOKKOS_RESTRICT B, + KOKKOS_INLINE_FUNCTION int team_invoke(const MemberType &member, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT A, const ValueType *KOKKOS_RESTRICT B, const int m, const int n, const int k, /**/ ValueType *KOKKOS_RESTRICT C); }; diff --git a/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp b/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp index d0d50a146c..c355185b74 100644 --- a/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp @@ -33,13 +33,11 @@ struct InnerLU { // for remainder square template - KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, - ValueType *KOKKOS_RESTRICT A); + KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, ValueType *KOKKOS_RESTRICT A); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, const int n, - ValueType *KOKKOS_RESTRICT A); + KOKKOS_INLINE_FUNCTION int serial_invoke(const int m, const int n, ValueType *KOKKOS_RESTRICT A); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp b/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp index 22395c9201..5b5b9bb147 100644 --- a/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp @@ -27,20 +27,17 @@ struct InnerTrsmLeftLowerUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftLowerUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftLowerUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; @@ -51,20 +48,17 @@ struct InnerTrsmLeftLowerNonUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftLowerNonUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftLowerNonUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; @@ -75,20 +69,17 @@ struct InnerTrsmLeftUpperUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftUpperUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftUpperUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; @@ -99,20 +90,17 @@ struct InnerTrsmLeftUpperNonUnitDiag { const int _as0, _as1, _bs0, _bs1; KOKKOS_INLINE_FUNCTION - InnerTrsmLeftUpperNonUnitDiag(const int as0, const int as1, const int bs0, - const int bs1) + InnerTrsmLeftUpperNonUnitDiag(const int as0, const int as1, const int bs0, const int bs1) : _as0(as0), _as1(as1), _bs0(bs0), _bs1(bs1) {} // trisolve template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int n, /**/ ValueType *KOKKOS_RESTRICT B); // for remainder template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, - const int m, const int n, + KOKKOS_INLINE_FUNCTION int serial_invoke(const ValueType *KOKKOS_RESTRICT A, const int m, const int n, /**/ ValueType *KOKKOS_RESTRICT B); }; diff --git a/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp b/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp index e28a0151ed..930bc790b0 100644 --- a/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp @@ -30,12 +30,10 @@ namespace KokkosBatched { template struct SerialInverseLU { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const wViewType &w) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const wViewType &w) { typedef typename wViewType::value_type value_type; // workspace w is always 1D view; reinterpret it - Kokkos::View W( - w.data(), A.extent(0), A.extent(1)); + Kokkos::View W(w.data(), A.extent(0), A.extent(1)); int r_val[3] = {}; r_val[0] = SerialCopy::invoke(A, W); @@ -48,19 +46,15 @@ struct SerialInverseLU { template struct TeamInverseLU { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const wViewType &w) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const wViewType &w) { typedef typename wViewType::value_type value_type; // workspace w is always 1D view; reinterpret it - Kokkos::View W( - w.data(), A.extent(0), A.extent(1)); + Kokkos::View W(w.data(), A.extent(0), A.extent(1)); int r_val[3] = {}; - r_val[0] = TeamCopy::invoke(member, A, W); - r_val[1] = TeamSetIdentity::invoke(member, A); - r_val[2] = TeamSolveLU::invoke( - member, W, A); + r_val[0] = TeamCopy::invoke(member, A, W); + r_val[1] = TeamSetIdentity::invoke(member, A); + r_val[2] = TeamSolveLU::invoke(member, W, A); return r_val[0] + r_val[1] + r_val[2]; } }; diff --git a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp index 051f78979d..bd73b4e267 100644 --- a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp +++ b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp @@ -56,10 +56,9 @@ enum BASE_KOKKOS_BATCHED_ALGOS : int { KK_SERIAL = BaseTplAlgos::N, N }; } #define N_BASE_ALGOS BaseKokkosBatchedAlgos::N -#define BASE_ALGO_STRS \ - "BaseHeuristicAlgos::SQUARE", "BaseHeuristicAlgos::TALL", \ - "BaseHeuristicAlgos::WIDE", "BaseTplAlgos::ARMPL", "BaseTplAlgosMKL", \ - "BaseKokkosBatchedAlgos::KK_SERIAL" +#define BASE_ALGO_STRS \ + "BaseHeuristicAlgos::SQUARE", "BaseHeuristicAlgos::TALL", "BaseHeuristicAlgos::WIDE", "BaseTplAlgos::ARMPL", \ + "BaseTplAlgosMKL", "BaseKokkosBatchedAlgos::KK_SERIAL" /// \brief TplParams abstracts underlying handle or execution queue type. struct TplParams { @@ -145,8 +144,7 @@ class BatchedKernelHandle { int vecLen = 0; bool enableDebug = false; - BatchedKernelHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, - int teamSize = 0, int vecLength = 0) + BatchedKernelHandle(int kernelAlgoType = BaseHeuristicAlgos::SQUARE, int teamSize = 0, int vecLength = 0) : teamSz(teamSize), vecLen(vecLength), _kernelAlgoType(kernelAlgoType) { #if !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) || ARMPL_BUILD < 1058 if (_kernelAlgoType == BaseTplAlgos::ARMPL) { @@ -161,9 +159,7 @@ class BatchedKernelHandle { int get_kernel_algo_type() const { return _kernelAlgoType; } - std::string get_kernel_algo_type_str() const { - return algo_type_strs[_kernelAlgoType]; - } + std::string get_kernel_algo_type_str() const { return algo_type_strs[_kernelAlgoType]; } decltype(auto) get_tpl_params() const { #if _kernelAlgoType == ARMPL && defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) diff --git a/batched/dense/src/KokkosBatched_LU_Decl.hpp b/batched/dense/src/KokkosBatched_LU_Decl.hpp index fcba6e20f8..363193c147 100644 --- a/batched/dense/src/KokkosBatched_LU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_LU_Decl.hpp @@ -28,9 +28,7 @@ struct SerialLU { // no piv version template KOKKOS_INLINE_FUNCTION static int invoke( - const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0); + const AViewType &A, const typename MagnitudeScalarType::type tiny = 0); }; template @@ -39,8 +37,7 @@ struct TeamLU { template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0); + const typename MagnitudeScalarType::type tiny = 0); }; /// @@ -52,8 +49,7 @@ struct LU { template KOKKOS_FORCEINLINE_FUNCTION static int invoke( const MemberType &member, const AViewType &A, - const typename MagnitudeScalarType< - typename AViewType::non_const_value_type>::type tiny = 0) { + const typename MagnitudeScalarType::type tiny = 0) { int r_val = 0; if (std::is_same::value) { r_val = SerialLU::invoke(A, tiny); diff --git a/batched/dense/src/KokkosBatched_Pttrf.hpp b/batched/dense/src/KokkosBatched_Pttrf.hpp index 4fcc944dc8..787e5aeee3 100644 --- a/batched/dense/src/KokkosBatched_Pttrf.hpp +++ b/batched/dense/src/KokkosBatched_Pttrf.hpp @@ -41,8 +41,7 @@ namespace KokkosBatched { template struct SerialPttrf { template - KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, - const EViewType &e); + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_QR_Decl.hpp b/batched/dense/src/KokkosBatched_QR_Decl.hpp index 993e9345fb..78bdcd4d4b 100644 --- a/batched/dense/src/KokkosBatched_QR_Decl.hpp +++ b/batched/dense/src/KokkosBatched_QR_Decl.hpp @@ -29,9 +29,7 @@ namespace KokkosBatched { template struct SerialQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const tViewType &t, - const wViewType &w); + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const tViewType &t, const wViewType &w); }; /// @@ -41,10 +39,8 @@ struct SerialQR { template struct TeamQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const AViewType & /*A*/, - const tViewType & /*t*/, - const wViewType & /*w*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const AViewType & /*A*/, + const tViewType & /*t*/, const wViewType & /*w*/) { /// not implemented return -1; } @@ -57,9 +53,7 @@ struct TeamQR { template struct TeamVectorQR { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const wViewType &w); }; @@ -69,9 +63,7 @@ struct TeamVectorQR { template struct QR { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const wViewType &w) { int r_val = 0; if (std::is_same::value) { diff --git a/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp b/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp index 134a97ed73..b08e5277a0 100644 --- a/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp +++ b/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp @@ -28,13 +28,9 @@ namespace KokkosBatched { template struct TeamVectorQR_WithColumnPivoting { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const tViewType &t, - const pViewType &p, - const wViewType &w, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, + const pViewType &p, const wViewType &w, /* */ int &matrix_rank); }; diff --git a/batched/dense/src/KokkosBatched_SVD_Decl.hpp b/batched/dense/src/KokkosBatched_SVD_Decl.hpp index e84008cb69..efade8029b 100644 --- a/batched/dense/src/KokkosBatched_SVD_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SVD_Decl.hpp @@ -56,20 +56,16 @@ struct SVD_S_Tag {}; struct SerialSVD { // Version to compute full factorization: A == U * diag(s) * Vt - template + template KOKKOS_INLINE_FUNCTION static int invoke( - SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &s, - const VtViewType &Vt, const WViewType &W, - typename AViewType::const_value_type tol = - Kokkos::ArithTraits::zero()); + SVD_USV_Tag, const AViewType &A, const UViewType &U, const SViewType &s, const VtViewType &Vt, const WViewType &W, + typename AViewType::const_value_type tol = Kokkos::ArithTraits::zero()); // Version which computes only singular values template KOKKOS_INLINE_FUNCTION static int invoke( SVD_S_Tag, const AViewType &A, const SViewType &s, const WViewType &W, - typename AViewType::const_value_type tol = - Kokkos::ArithTraits::zero()); + typename AViewType::const_value_type tol = Kokkos::ArithTraits::zero()); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Scale_Decl.hpp b/batched/dense/src/KokkosBatched_Scale_Decl.hpp index dbb9a43ffb..94453a5ede 100644 --- a/batched/dense/src/KokkosBatched_Scale_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Scale_Decl.hpp @@ -26,49 +26,45 @@ namespace KokkosBatched { /// Serial Scale /// -struct [[deprecated]] SerialScale{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] SerialScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::SerialScale is deprecated: use KokkosBlas::SerialScale " "instead"); -return 0; -} // namespace KokkosBatched -} -; + return 0; + } // namespace KokkosBatched +}; /// /// Team Scale /// template -struct [[deprecated]] TeamScale{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] TeamScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::TeamScale is deprecated: use KokkosBlas::TeamScale " "instead"); -return 0; -} -} -; + return 0; + } +}; /// /// TeamVector Scale /// template -struct [[deprecated]] TeamVectorScale{ - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A){ - Kokkos::abort("KokkosBatched::TeamVectorScale is deprecated: use " - "KokkosBlas::TeamVectorScale instead"); -return 0; -} -} -; +struct [[deprecated]] TeamVectorScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( + "KokkosBatched::TeamVectorScale is deprecated: use " + "KokkosBlas::TeamVectorScale instead"); + return 0; + } +}; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp b/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp index b78d3e7b05..27c2b22ed7 100644 --- a/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp @@ -39,8 +39,7 @@ struct SerialSetIdentity { template struct TeamSetIdentity { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A); }; /// @@ -49,8 +48,7 @@ struct TeamSetIdentity { template struct SetIdentity { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A) { int r_val = 0; if (std::is_same::value) { r_val = SerialSetIdentity::invoke(A); diff --git a/batched/dense/src/KokkosBatched_Set_Decl.hpp b/batched/dense/src/KokkosBatched_Set_Decl.hpp index ebddb72a4a..d33d186275 100644 --- a/batched/dense/src/KokkosBatched_Set_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Set_Decl.hpp @@ -25,49 +25,45 @@ namespace KokkosBatched { /// Serial Set /// -struct [[deprecated]] SerialSet{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] SerialSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::SerialSet is deprecated: use KokkosBlas::SerialSet " "instead"); -return 0; -} // namespace KokkosBatched -} -; + return 0; + } // namespace KokkosBatched +}; /// /// Team Set /// template -struct [[deprecated]] TeamSet{ - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A){Kokkos::abort( +struct [[deprecated]] TeamSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( "KokkosBatched::TeamSet is deprecated: use KokkosBlas::TeamSet " "instead"); -return 0; -} -} -; + return 0; + } +}; /// /// TeamVector Set /// template -struct [[deprecated]] TeamVectorSet{ - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A){ - Kokkos::abort("KokkosBatched::TeamVectorSet is deprecated: use " - "KokkosBlas::TeamVectorSet instead"); -return 0; -} -} -; +struct [[deprecated]] TeamVectorSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + Kokkos::abort( + "KokkosBatched::TeamVectorSet is deprecated: use " + "KokkosBlas::TeamVectorSet instead"); + return 0; + } +}; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp b/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp index 8e731e2666..119f5c6916 100644 --- a/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp @@ -30,25 +30,19 @@ template struct SerialSolveLU { // no piv version template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const BViewType &B) { int r_val[2] = {}; const typename AViewType::non_const_value_type one(1.0); if (std::is_same::value) { // First, compute Y (= U*X) by solving the system L*Y = B for Y - r_val[0] = SerialTrsm::invoke(one, A, B); + r_val[0] = SerialTrsm::invoke(one, A, B); // Second, compute X by solving the system U*X = Y for X - r_val[1] = SerialTrsm::invoke(one, A, B); - } else if (std::is_same::value || - std::is_same::value) { + r_val[1] = SerialTrsm::invoke(one, A, B); + } else if (std::is_same::value || std::is_same::value) { // First, compute Y (= L'*X) by solving the system U'*Y = B for Y - r_val[0] = SerialTrsm::invoke(one, A, B); + r_val[0] = SerialTrsm::invoke(one, A, B); // Second, compute X by solving the system L'*X = Y for X - r_val[1] = SerialTrsm::invoke(one, A, B); + r_val[1] = SerialTrsm::invoke(one, A, B); } return r_val[0] + r_val[1]; } @@ -58,26 +52,23 @@ template struct TeamSolveLU { // no piv version template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { int r_val[2] = {}; const typename AViewType::non_const_value_type one(1.0); if (std::is_same::value) { // First, compute Y (= U*X) by solving the system L*Y = B for Y - r_val[0] = TeamTrsm::invoke(member, one, A, B); + r_val[0] = + TeamTrsm::invoke(member, one, A, B); // Second, compute X by solving the system U*X = Y for X - r_val[1] = TeamTrsm::invoke(member, one, A, B); - } else if (std::is_same::value || - std::is_same::value) { + r_val[1] = + TeamTrsm::invoke(member, one, A, B); + } else if (std::is_same::value || std::is_same::value) { // First, compute Y (= L'*X) by solving the system U'*Y = B for Y - r_val[0] = TeamTrsm::invoke(member, one, A, B); + r_val[0] = + TeamTrsm::invoke(member, one, A, B); // Second, compute X by solving the system L'*X = Y for X - r_val[1] = TeamTrsm::invoke(member, one, A, B); + r_val[1] = + TeamTrsm::invoke(member, one, A, B); } return r_val[0] + r_val[1]; } @@ -86,14 +77,11 @@ struct TeamSolveLU { /// /// Selective Interface /// -template +template struct SolveLU { // no piv version template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const AViewType &A, - const BViewType &B) { + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { int r_val = 0; if (std::is_same::value) { r_val = SerialSolveLU::invoke(A, B); diff --git a/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp b/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp index e55836de6c..c881a0b0f7 100644 --- a/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp @@ -46,13 +46,11 @@ namespace KokkosBatched { template struct TeamVectorSolveUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int matrix_rank, const UViewType &U, - const TViewType &T, const VViewType &V, const pViewType &p, - const XViewType &X, const BViewType &B, const wViewType &w); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int matrix_rank, const UViewType &U, + const TViewType &T, const VViewType &V, const pViewType &p, + const XViewType &X, const BViewType &B, const wViewType &w); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Tbsv.hpp b/batched/dense/src/KokkosBatched_Tbsv.hpp index 7510c07969..f7d700be44 100644 --- a/batched/dense/src/KokkosBatched_Tbsv.hpp +++ b/batched/dense/src/KokkosBatched_Tbsv.hpp @@ -41,12 +41,10 @@ namespace KokkosBatched { /// No nested parallel_for is used inside of the function. /// -template +template struct SerialTbsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const XViewType &X, const int k); + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &X, const int k); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Trmm_Decl.hpp b/batched/dense/src/KokkosBatched_Trmm_Decl.hpp index 81d1f8d073..c284ed63b2 100644 --- a/batched/dense/src/KokkosBatched_Trmm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trmm_Decl.hpp @@ -22,13 +22,10 @@ namespace KokkosBatched { -template +template struct SerialTrmm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B); }; } // namespace KokkosBatched #endif // __KOKKOSBATCHED_TRMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_Trsm_Decl.hpp b/batched/dense/src/KokkosBatched_Trsm_Decl.hpp index e0aee4659f..d2220953cc 100644 --- a/batched/dense/src/KokkosBatched_Trsm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trsm_Decl.hpp @@ -23,54 +23,42 @@ namespace KokkosBatched { -template +template struct SerialTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A, - const BViewType &B); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B); }; -template +template struct TeamTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B); }; -template +template struct TeamVectorTrsm { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B); }; /// /// Selective Interface /// -template +template struct Trsm { template - KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const BViewType &B) { int r_val = 0; if (std::is_same::value) { - r_val = SerialTrsm::invoke( - alpha, A, B); + r_val = SerialTrsm::invoke(alpha, A, B); } else if (std::is_same::value) { - r_val = TeamTrsm::invoke(member, alpha, A, B); + r_val = TeamTrsm::invoke(member, alpha, A, B); } return r_val; } diff --git a/batched/dense/src/KokkosBatched_Trsv_Decl.hpp b/batched/dense/src/KokkosBatched_Trsv_Decl.hpp index ed9f5cca26..e3da43a95d 100644 --- a/batched/dense/src/KokkosBatched_Trsv_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trsv_Decl.hpp @@ -27,12 +27,10 @@ namespace KokkosBatched { /// Serial Trsv /// -template +template struct SerialTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, - const AViewType & /*A*/, + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, const AViewType & /*A*/, const bViewType & /*b*/) { assert(false && "Error: encounter dummy impl"); return 0; @@ -43,14 +41,11 @@ struct SerialTrsv { /// Team Trsv /// -template +template struct TeamTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const bViewType & /*b*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const bViewType & /*b*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -60,14 +55,11 @@ struct TeamTrsv { /// TeamVector Trsv /// -template +template struct TeamVectorTrsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, - const ScalarType /*alpha*/, - const AViewType & /*A*/, - const bViewType & /*b*/) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, const ScalarType /*alpha*/, + const AViewType & /*A*/, const bViewType & /*b*/) { assert(false && "Error: encounter dummy impl"); return 0; } @@ -76,24 +68,19 @@ struct TeamVectorTrsv { /// /// Selective Interface /// -template +template struct Trsv { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A, const bViewType &b) { int r_val = 0; if (std::is_same::value) { - r_val = - SerialTrsv::invoke(alpha, A, b); + r_val = SerialTrsv::invoke(alpha, A, b); } else if (std::is_same::value) { - r_val = TeamTrsv::invoke( - member, alpha, A, b); + r_val = TeamTrsv::invoke(member, alpha, A, b); } else if (std::is_same::value) { - r_val = TeamVectorTrsv::invoke(member, alpha, A, b); + r_val = TeamVectorTrsv::invoke(member, alpha, A, b); } return r_val; } @@ -105,116 +92,98 @@ struct Trsv { #include "KokkosBatched_Trsv_Team_Impl.hpp" #include "KokkosBatched_Trsv_TeamVector_Impl.hpp" -#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalLower::invoke( \ - DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalUpper::invoke( \ - DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalUpper::invoke( \ - DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::SerialTrsvInternalLower::invoke( \ - DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalUpper::invoke( \ - MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) - -#define KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - KokkosBatched::TeamVectorTrsvInternalLower::invoke( \ - MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) - -#define KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalLower::invoke(DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + +#define KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalUpper::invoke(DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalUpper::invoke(DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + +#define KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ + KokkosBatched::SerialTrsvInternalLower::invoke(DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + KokkosBatched::TeamTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + +#define KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS) \ + KokkosBatched::TeamTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + KokkosBatched::TeamTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, BS) + +#define KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS) \ + KokkosBatched::TeamTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, \ + BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, \ + BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalUpper::invoke(MEMBER, DIAG::use_unit_diag, M, ALPHA, A, AS0, AS1, B, \ + BS) + +#define KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + KokkosBatched::TeamVectorTrsvInternalLower::invoke(MEMBER, DIAG::use_unit_diag, N, ALPHA, A, AS1, AS0, B, \ + BS) + +#define KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS); \ } -#define KOKKOSBATCHED_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_LOWER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); \ } -#define KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, \ + AS1, B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS); \ } -#define KOKKOSBATCHED_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS) \ - if (std::is_same::value) { \ - KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ - } else if (std::is_same::value) { \ - KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE( \ - ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ +#define KOKKOSBATCHED_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(MODETYPE, ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, \ + B, BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAMVECTOR_TRSV_UPPER_TRANSPOSE_INTERNAL_INVOKE(ALGOTYPE, MEMBER, DIAG, M, N, ALPHA, A, AS0, AS1, B, \ + BS); \ } #endif diff --git a/batched/dense/src/KokkosBatched_UTV_Decl.hpp b/batched/dense/src/KokkosBatched_UTV_Decl.hpp index 792236a14f..bae2780e10 100644 --- a/batched/dense/src/KokkosBatched_UTV_Decl.hpp +++ b/batched/dense/src/KokkosBatched_UTV_Decl.hpp @@ -57,12 +57,10 @@ namespace KokkosBatched { template struct TeamVectorUTV { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const AViewType &A, const pViewType &p, - const UViewType &U, const VViewType &V, const wViewType &w, - int &matrix_rank); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const pViewType &p, + const UViewType &U, const VViewType &V, const wViewType &w, + int &matrix_rank); }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index 71d159cb03..e44af7bc04 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -143,9 +143,7 @@ struct DefaultInternalVectorLength { }; template struct DefaultInternalVectorLength { - enum : int { - value = DefaultVectorLength::value - }; + enum : int { value = DefaultVectorLength::value }; }; #if defined(KOKKOS_ENABLE_CUDA) @@ -174,13 +172,11 @@ struct DefaultInternalVectorLength { enum : int { value = 2 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::CudaUVMSpace> { +struct DefaultInternalVectorLength, Kokkos::CudaUVMSpace> { enum : int { value = 2 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::CudaUVMSpace> { +struct DefaultInternalVectorLength, Kokkos::CudaUVMSpace> { enum : int { value = 1 }; }; #endif @@ -256,18 +252,12 @@ class ArithTraits, l>> { typedef typename ArithTraits::val_type val_scalar_type; typedef typename ArithTraits::mag_type mag_scalar_type; - typedef KokkosBatched::Vector, l> - val_type; - typedef KokkosBatched::Vector, l> - mag_type; + typedef KokkosBatched::Vector, l> val_type; + typedef KokkosBatched::Vector, l> mag_type; - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type &val) { - return val; - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type &val) { return val; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type &val) { - return val; - } + static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type &val) { return val; } static KOKKOS_FORCEINLINE_FUNCTION val_type abs(const val_type &val) { using KAT = ArithTraits; @@ -286,17 +276,13 @@ class ArithTraits, l>> { }; template -class ArithTraits< - KokkosBatched::Vector>, l>> { +class ArithTraits>, l>> { public: typedef typename ArithTraits::val_type val_scalar_type; typedef typename ArithTraits::mag_type mag_scalar_type; - typedef KokkosBatched::Vector< - KokkosBatched::SIMD>, l> - val_type; - typedef KokkosBatched::Vector, l> - mag_type; + typedef KokkosBatched::Vector>, l> val_type; + typedef KokkosBatched::Vector, l> mag_type; static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type &val) { mag_type r_val; diff --git a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index 753904dbb9..52a73deda4 100644 --- a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -63,8 +63,7 @@ class Vector, l> { for (int i = 0; i < vector_length; ++i) _data[i] = val; } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { KOKKOSKERNELS_FORCE_SIMD for (int i = 0; i < vector_length; ++i) _data[i] = b[i]; } @@ -140,8 +139,7 @@ class Vector, 2> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; } @@ -183,9 +181,7 @@ class Vector, 2> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -232,8 +228,7 @@ class Vector, 2> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; } @@ -275,9 +270,7 @@ class Vector, 2> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -334,8 +327,7 @@ class Vector, 4> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; _data.z = b[2]; @@ -389,9 +381,7 @@ class Vector, 4> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -448,8 +438,7 @@ class Vector, 4> { } template - KOKKOS_INLINE_FUNCTION Vector( - const Vector, vector_length> &b) { + KOKKOS_INLINE_FUNCTION Vector(const Vector, vector_length> &b) { _data.x = b[0]; _data.y = b[1]; _data.z = b[2]; @@ -503,9 +492,7 @@ class Vector, 4> { } KOKKOS_INLINE_FUNCTION - value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; } // namespace KokkosBatched @@ -580,13 +567,9 @@ class Vector, 4> { inline void storeAligned(value_type *p) const { _mm256_store_pd(p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm256_storeu_pd(p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm256_storeu_pd(p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -657,17 +640,11 @@ class Vector >, 2> { return *this; } - inline void storeAligned(value_type *p) const { - _mm256_store_pd((mag_type *)p, _data); - } + inline void storeAligned(value_type *p) const { _mm256_store_pd((mag_type *)p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm256_storeu_pd((mag_type *)p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm256_storeu_pd((mag_type *)p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; } // namespace KokkosBatched #endif /* #if defined(__AVX__) || defined(__AVX2__) */ @@ -737,13 +714,9 @@ class Vector, 8> { inline void storeAligned(value_type *p) const { _mm512_store_pd(p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm512_storeu_pd(p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm512_storeu_pd(p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; template <> @@ -767,13 +740,11 @@ class Vector >, 4> { public: inline Vector() { _data = _mm512_setzero_pd(); } inline Vector(const value_type &val) { - _data = _mm512_mask_broadcast_f64x4(_mm512_set1_pd(val.imag()), 0x55, - _mm256_set1_pd(val.real())); + _data = _mm512_mask_broadcast_f64x4(_mm512_set1_pd(val.imag()), 0x55, _mm256_set1_pd(val.real())); KOKKOSKERNELS_GNU_COMPILER_FENCE } inline Vector(const mag_type &val) { - _data = _mm512_mask_broadcast_f64x4(_mm512_setzero_pd(), 0x55, - _mm256_set1_pd(val)); + _data = _mm512_mask_broadcast_f64x4(_mm512_setzero_pd(), 0x55, _mm256_set1_pd(val)); KOKKOSKERNELS_GNU_COMPILER_FENCE } inline Vector(const type &b) { _data = b._data; } @@ -810,17 +781,11 @@ class Vector >, 4> { return *this; } - inline void storeAligned(value_type *p) const { - _mm512_store_pd((mag_type *)p, _data); - } + inline void storeAligned(value_type *p) const { _mm512_store_pd((mag_type *)p, _data); } - inline void storeUnaligned(value_type *p) const { - _mm512_storeu_pd((mag_type *)p, _data); - } + inline void storeUnaligned(value_type *p) const { _mm512_storeu_pd((mag_type *)p, _data); } - inline value_type &operator[](const int &i) const { - return reinterpret_cast(&_data)[i]; - } + inline value_type &operator[](const int &i) const { return reinterpret_cast(&_data)[i]; } }; } // namespace KokkosBatched diff --git a/batched/dense/src/KokkosBatched_Xpay.hpp b/batched/dense/src/KokkosBatched_Xpay.hpp index 1e9a08623b..51418fd81a 100644 --- a/batched/dense/src/KokkosBatched_Xpay.hpp +++ b/batched/dense/src/KokkosBatched_Xpay.hpp @@ -44,9 +44,7 @@ namespace KokkosBatched { struct SerialXpay { template - KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, - const ViewType &X, - const ViewType &Y); + KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, const ViewType &X, const ViewType &Y); }; /// \brief Team Batched XPAY: @@ -72,9 +70,7 @@ struct SerialXpay { template struct TeamXpay { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const ViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const ViewType &X, const ViewType &Y); }; @@ -102,9 +98,7 @@ struct TeamXpay { template struct TeamVectorXpay { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const alphaViewType &alpha, - const ViewType &X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, const ViewType &X, const ViewType &Y); }; diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 3c00b4f477..6c2c359f00 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -25,14 +25,10 @@ using namespace KokkosBatched; namespace Test { -template -void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, - const int N, const int matAdim1, - const int matAdim2, const int matBdim1, - const int matBdim2, const int matCdim1, - const int matCdim2, ScalarType alpha, - ScalarType beta) { +template +void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, const int N, const int matAdim1, + const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, + const int matCdim2, ScalarType alpha, ScalarType beta) { using execution_space = typename DeviceType::execution_space; using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; @@ -43,15 +39,11 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, auto algo_type = batchedGemmHandle->get_kernel_algo_type(); ViewType a_expected, a_actual, b_expected, b_actual, c_expected, c_actual; std::string fmsg; - std::string fmsg_rhs = - "algo_type:" + batchedGemmHandle->get_kernel_algo_type_str() + ", "; + std::string fmsg_rhs = "algo_type:" + batchedGemmHandle->get_kernel_algo_type_str() + ", "; fmsg_rhs += ("N:" + std::to_string(N) + ", "); - fmsg_rhs += - ("A:" + std::to_string(matAdim1) + "x" + std::to_string(matAdim2) + ", "); - fmsg_rhs += - ("B:" + std::to_string(matBdim1) + "x" + std::to_string(matBdim2) + ", "); - fmsg_rhs += - ("C:" + std::to_string(matCdim1) + "x" + std::to_string(matCdim2) + "\n"); + fmsg_rhs += ("A:" + std::to_string(matAdim1) + "x" + std::to_string(matAdim2) + ", "); + fmsg_rhs += ("B:" + std::to_string(matBdim1) + "x" + std::to_string(matBdim2) + ", "); + fmsg_rhs += ("C:" + std::to_string(matCdim1) + "x" + std::to_string(matCdim2) + "\n"); if (std::is_same::value) { a_expected = ViewType("a_expected", N, matAdim1, matAdim2); @@ -86,10 +78,8 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, // Check for DblBuf runtime errors related to team_size try { fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); - Impl::BatchedDblBufGemm( + Impl::BatchedDblBufGemm( batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual) .invoke(); FAIL() << (fmsg + fmsg_rhs); @@ -100,11 +90,9 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, // Check for DblBuf runtime errors related to vector_len try { fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); - Impl::BatchedDblBufGemm< - transA, transB, batchLayout, BatchedGemmHandle, ScalarType, - decltype(a_actual), decltype(b_actual), decltype(c_actual), - BoundsCheck::No, AlphaTag::No, 65536, 65536 * 2, 65536>( - batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual) + Impl::BatchedDblBufGemm(batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual) .invoke(); FAIL() << (fmsg + fmsg_rhs); } catch (const std::runtime_error& error) { @@ -123,9 +111,8 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, #endif fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); - ret = BatchedGemm( - batchedGemmHandle, alpha, a_actual, b_actual, beta, - c_actual); // Compute c_actual + ret = BatchedGemm(batchedGemmHandle, alpha, a_actual, b_actual, beta, + c_actual); // Compute c_actual } catch (const std::runtime_error& error) { std::string error_msg = error.what(); if (algo_type == BaseHeuristicAlgos::SQUARE && matCdim1 != matCdim2) { @@ -135,8 +122,7 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, auto ninter = batchedGemmHandle->get_tpl_params()[0]; // No runtime errors expected since layout is valid, double is a supported // type, and ninter != 0 - if (std::is_same::value && - ninter != 0) { + if (std::is_same::value && ninter != 0) { FAIL() << (error_msg + fmsg + fmsg_rhs); } #else @@ -149,12 +135,10 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, } ASSERT_EQ(ret, 0) << (fmsg + fmsg_rhs); - Functor_BatchedVanillaGEMM - vgemm; - vgemm.A_t = std::is_same::value; - vgemm.B_t = std::is_same::value; - vgemm.batch_size_last_dim = - std::is_same::value; + Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = std::is_same::value; + vgemm.B_t = std::is_same::value; + vgemm.batch_size_last_dim = std::is_same::value; vgemm.A_c = vgemm.B_c = false; vgemm.A = a_expected; vgemm.B = b_expected; @@ -165,10 +149,8 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, Kokkos::fence(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host Kokkos::deep_copy(c_expected_host, c_expected); @@ -205,26 +187,21 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, EXPECT_NEAR_KK(diff / sum, 0, eps, fmsg + fmsg_rhs); } -template -void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, - const int matBdim1, const int matBdim2, +template +void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, const int matCdim2) { { BatchedGemmHandle batchedGemmHandle; - ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), - BaseHeuristicAlgos::SQUARE); + ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), BaseHeuristicAlgos::SQUARE); ASSERT_EQ(batchedGemmHandle.teamSz, 0); ASSERT_EQ(batchedGemmHandle.vecLen, 0); #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) cublasHandle_t cublas_handle; - BatchedGemmHandle batchedGemmHandleCublas(cublas_handle, - GemmTplAlgos::CUBLAS, 0, 0); + BatchedGemmHandle batchedGemmHandleCublas(cublas_handle, GemmTplAlgos::CUBLAS, 0, 0); ASSERT_EQ(&cublas_handle, batchedGemmHandleCublas.get_tpl_params()); - ASSERT_EQ(batchedGemmHandleCublas.get_kernel_algo_type(), - (int)GemmTplAlgos::CUBLAS); + ASSERT_EQ(batchedGemmHandleCublas.get_kernel_algo_type(), (int)GemmTplAlgos::CUBLAS); ASSERT_EQ(batchedGemmHandleCublas.teamSz, 0); ASSERT_EQ(batchedGemmHandleCublas.vecLen, 0); #endif @@ -232,53 +209,37 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, // FIXME temporary workaround to run this magma test only if cublas is not // enabled the design of the BatchedGemmHandle currently does not allow // simultanous testing in this way. See issue #2177 -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) magma_queue_t magma_queue; - BatchedGemmHandle batchedGemmHandleMagma(magma_queue, GemmTplAlgos::MAGMA, - 0, 0); + BatchedGemmHandle batchedGemmHandleMagma(magma_queue, GemmTplAlgos::MAGMA, 0, 0); ASSERT_EQ(&magma_queue, batchedGemmHandleMagma.get_tpl_params()); - ASSERT_EQ(batchedGemmHandleMagma.get_kernel_algo_type(), - (int)GemmTplAlgos::MAGMA); + ASSERT_EQ(batchedGemmHandleMagma.get_kernel_algo_type(), (int)GemmTplAlgos::MAGMA); ASSERT_EQ(batchedGemmHandleMagma.teamSz, 0); ASSERT_EQ(batchedGemmHandleMagma.vecLen, 0); #endif } - for (int algo_type = BaseHeuristicAlgos::SQUARE; - algo_type < GemmKokkosBatchedAlgos::N; ++algo_type) { + for (int algo_type = BaseHeuristicAlgos::SQUARE; algo_type < GemmKokkosBatchedAlgos::N; ++algo_type) { { try { BatchedGemmHandle batchedGemmHandle(algo_type); ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), algo_type); - if (algo_type == BaseTplAlgos::ARMPL || - algo_type == BaseKokkosBatchedAlgos::KK_SERIAL || - algo_type == GemmKokkosBatchedAlgos::KK_SERIAL_RANK0 || - algo_type == GemmKokkosBatchedAlgos::KK_DBLBUF) { - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 1.5, 3.0); + if (algo_type == BaseTplAlgos::ARMPL || algo_type == BaseKokkosBatchedAlgos::KK_SERIAL || + algo_type == GemmKokkosBatchedAlgos::KK_SERIAL_RANK0 || algo_type == GemmKokkosBatchedAlgos::KK_DBLBUF) { + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 1.5, 3.0); } else if (algo_type == BaseHeuristicAlgos::SQUARE) { // Invoke 4 times to ensure we cover all paths for alpha and beta - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 0.0, 0.0); - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 1.0, 0.0); - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 0.0, 1.0); - impl_test_batched_gemm_with_handle( - &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, - matCdim1, matCdim2, 1.5, 3.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 0.0, 0.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 1.0, 0.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 0.0, 1.0); + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, matCdim1, matCdim2, 1.5, 3.0); } else { try { // Allocate these views to invoke BatchedGemm with an unsupported @@ -291,8 +252,7 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, using bl = typename ParamTagType::batchLayout; ScalarType alpha = 0.34; ScalarType beta = 0.43; - BatchedGemm(&batchedGemmHandle, alpha, a_actual, - b_actual, beta, c_actual); + BatchedGemm(&batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual); std::string fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); FAIL() << fmsg; } catch (const std::runtime_error& error) { @@ -314,26 +274,21 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, } } // namespace Test -template +template void test_batched_gemm_with_layout(int N) { // Square cases { int i = 0; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); i = 10; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); i = 25; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); i = 32; - Test::impl_test_batched_gemm(N, i, i, i, i, i, i); + Test::impl_test_batched_gemm(N, i, i, i, i, i, i); } // Non-square cases @@ -341,63 +296,42 @@ void test_batched_gemm_with_layout(int N) { int dimM = 1 * i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimM, dimK, dimK, dimN, - dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimM, dimK, dimK, dimN, dimM, + dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimM, dimK, dimN, dimK, - dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimM, dimK, dimN, dimK, dimM, + dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimK, dimM, dimK, dimN, - dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimK, dimM, dimK, dimN, dimM, + dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::impl_test_batched_gemm(N, dimK, dimM, dimN, dimK, - dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::impl_test_batched_gemm(N, dimK, dimM, dimN, dimK, dimM, + dimN); } } } -template +template int test_batched_gemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - if constexpr (std::is_same_v) { - using param_tag_type = ::Test::SharedParamTag; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + if constexpr (std::is_same_v) { + using param_tag_type = + ::Test::SharedParamTag; typedef Kokkos::View llVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); + test_batched_gemm_with_layout(0); + test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); } else { std::cerr << "TEST SKIPPED since BatchLayout is not Right." << std::endl; } @@ -406,24 +340,16 @@ int test_batched_gemm() { #endif // KOKKOSKERNELS_INST_LAYOUTLEFT #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - if constexpr (std::is_same_v) { - using param_tag_type = ::Test::SharedParamTag; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + if constexpr (std::is_same_v) { + using param_tag_type = + ::Test::SharedParamTag; typedef Kokkos::View lrVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); + test_batched_gemm_with_layout(0); + test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); } else { std::cerr << "TEST SKIPPED since BatchLayout is not Left." << std::endl; } diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp index 3c58f432ec..4e9bfa42ef 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp @@ -16,139 +16,89 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ -TEST_F(TestCategory, - batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; +TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_right) { + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, Kokkos::complex, - param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ -TEST_F(TestCategory, - batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; +TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right) { + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp index 62a4a291a8..d2e9fe48d7 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp @@ -16,206 +16,140 @@ // We do not ETI half-types. Only test this if ETI ONLY is off // and bhalf_t is not an alias to float. -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT // We do not ETI half-types. Only test this if ETI ONLY is off // and half_t is not an alias to float. -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } @@ -224,59 +158,43 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { #if defined(KOKKOSKERNELS_INST_DOUBLE) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_left) { - using param_tag_type = - ::Test::SharedParamTag; + using param_tag_type = ::Test::SharedParamTag; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_right) { - typedef ::Test::SharedParamTag - param_tag_type; + typedef ::Test::SharedParamTag param_tag_type; test_batched_gemm(); } diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index c1328291fb..f536f220d3 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -20,14 +20,9 @@ namespace KokkosBatched { template -void create_tridiagonal_batched_matrices(const MatrixViewType& A, - const VectorViewType& B) { - Kokkos::Random_XorShift64_Pool< - typename VectorViewType::device_type::execution_space> - random(13718); - Kokkos::fill_random( - B, random, - Kokkos::reduction_identity::prod()); +void create_tridiagonal_batched_matrices(const MatrixViewType& A, const VectorViewType& B) { + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(B, random, Kokkos::reduction_identity::prod()); auto A_host = Kokkos::create_mirror_view(A); @@ -58,8 +53,7 @@ void create_tridiagonal_batched_matrices(const MatrixViewType& A, } template -void create_banded_triangular_matrix(InViewType& in, OutViewType& out, - int k = 1, bool band_storage = true) { +void create_banded_triangular_matrix(InViewType& in, OutViewType& out, int k = 1, bool band_storage = true) { auto h_in = Kokkos::create_mirror_view(in); auto h_out = Kokkos::create_mirror_view(out); const int N = in.extent(0), BlkSize = in.extent(1); diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp index 90ce5addc3..df6f0ee069 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp @@ -36,8 +36,7 @@ struct Functor_TestBatchedSerialAxpy { const ViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialAxpy(const alphaViewType &alpha, const ViewType &X, - const ViewType &Y) + Functor_TestBatchedSerialAxpy(const alphaViewType &alpha, const ViewType &X, const ViewType &Y) : _alpha(alpha), _X(X), _Y(Y) {} KOKKOS_INLINE_FUNCTION @@ -68,13 +67,11 @@ void impl_test_batched_axpy(const int N, const int BlkSize) { typedef typename alphaViewType::const_value_type alpha_const_value_type; typedef Kokkos::ArithTraits ats; - ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), - Y1("y1", N, BlkSize); + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); alphaViewType alpha("alpha", N); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X0, random, const_value_type(1.0)); Kokkos::fill_random(Y0, random, const_value_type(1.0)); Kokkos::fill_random(alpha, random, alpha_const_value_type(1.0)); @@ -94,12 +91,9 @@ void impl_test_batched_axpy(const int N, const int BlkSize) { Kokkos::deep_copy(Y0_host, Y0); for (int l = 0; l < N; ++l) - for (int i = 0; i < BlkSize; ++i) - Y0_host(l, i) += alpha_host(l) * X0_host(l, i); + for (int i = 0; i < BlkSize; ++i) Y0_host(l, i) += alpha_host(l) * X0_host(l, i); - Functor_TestBatchedSerialAxpy(alpha, X1, - Y1) - .run(); + Functor_TestBatchedSerialAxpy(alpha, X1, Y1).run(); Kokkos::fence(); @@ -128,25 +122,20 @@ int test_batched_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Axpy::impl_test_batched_axpy( - 1024, i); + Test::Axpy::impl_test_batched_axpy(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Axpy::impl_test_batched_axpy( - 1024, i); + Test::Axpy::impl_test_batched_axpy(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp index ed647f1e3b..7d1b3301f1 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp @@ -16,8 +16,7 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_dcomplex) { - test_batched_axpy, - Kokkos::complex>(); + test_batched_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_double) { diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp index 3f1f6af2fd..a0c49287f7 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_serial_axpy_nt_float_float) { - test_batched_axpy(); -} +TEST_F(TestCategory, batched_scalar_serial_axpy_nt_float_float) { test_batched_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_axpy_nt_double_double) { - test_batched_axpy(); -} +TEST_F(TestCategory, batched_scalar_serial_axpy_nt_double_double) { test_batched_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index 7f27fa7dcf..144bb2251e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" @@ -37,8 +37,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_TestBatchedSerialGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -46,8 +45,7 @@ struct Functor_TestBatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_TestBatchedSerialGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -57,8 +55,8 @@ struct Functor_TestBatchedSerialGemm { auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + SerialGemm::invoke(_alpha, aa, bb, _beta, + cc); } inline void run() { @@ -73,10 +71,8 @@ struct Functor_TestBatchedSerialGemm { } }; -template -void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, - const int matBdim1, const int matBdim2, +template +void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, const int matBdim2, const int matCdim1, const int matCdim2) { using execution_space = typename DeviceType::execution_space; using transA = typename ParamTagType::transA; @@ -88,12 +84,9 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ScalarType alpha = ScalarType(1.5); ScalarType beta = ScalarType(3.0); - ViewType a_expected("a_expected", N, matAdim1, matAdim2), - a_actual("a_actual", N, matAdim1, matAdim2), - b_expected("b_expected", N, matBdim1, matBdim2), - b_actual("b_actual", N, matBdim1, matBdim2), - c_expected("c_expected", N, matCdim1, matCdim2), - c_actual("c_actual", N, matCdim1, matCdim2); + ViewType a_expected("a_expected", N, matAdim1, matAdim2), a_actual("a_actual", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b_actual("b_actual", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c_actual("c_actual", N, matCdim1, matCdim2); Kokkos::Random_XorShift64_Pool random(13718); @@ -107,8 +100,7 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, Kokkos::deep_copy(b_actual, b_expected); Kokkos::deep_copy(c_actual, c_expected); - Functor_BatchedVanillaGEMM - vgemm; + Functor_BatchedVanillaGEMM vgemm; vgemm.A_t = std::is_same::value; vgemm.B_t = std::is_same::value; vgemm.A_c = vgemm.B_c = false; @@ -118,15 +110,12 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, vgemm.alpha = alpha; vgemm.beta = beta; vgemm.run(); // Compute c_expected - Functor_TestBatchedSerialGemm(alpha, a_actual, b_actual, beta, - c_actual) + Functor_TestBatchedSerialGemm(alpha, a_actual, b_actual, + beta, c_actual) .run(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host for comparison Kokkos::deep_copy(c_expected_host, c_expected); @@ -157,57 +146,41 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, } // namespace Gemm } // namespace Test -template +template int test_batched_gemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, - 10, 10, 10); + typedef Kokkos::View ViewType; + Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, 10, + 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Gemm::impl_test_batched_gemm(1024, i, i, - i, i, i, i); + Test::Gemm::impl_test_batched_gemm(1024, i, i, i, i, + i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } @@ -215,52 +188,37 @@ int test_batched_gemm() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, - 10, 10, 10); + typedef Kokkos::View ViewType; + Test::Gemm::impl_test_batched_gemm(0, 10, 10, 10, 10, + 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Gemm::impl_test_batched_gemm(1024, i, i, - i, i, i, i); + Test::Gemm::impl_test_batched_gemm(1024, i, i, i, i, + i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::Gemm::impl_test_batched_gemm( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::Gemm::impl_test_batched_gemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp index f671292c98..f785965602 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp @@ -18,32 +18,24 @@ /// dcomplex, dcomplex TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_dcomplex) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::Gemm::ParamTag @@ -59,32 +51,24 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_dcomplex) { /// dcomplex, double TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, param_tag_type, - algo_tag_type>(); + test_batched_gemm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::Gemm::ParamTag diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp index 6f074867d9..afe5744688 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp @@ -15,112 +15,88 @@ //@HEADER #if defined(KOKKOS_BHALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_bhalf_bhalf) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT #if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_half_half) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_float_float) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; test_batched_gemm(); } @@ -128,31 +104,23 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_double_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_double_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_double_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_double_double) { - typedef ::Test::Gemm::ParamTag - param_tag_type; + typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp index bb05fab3bb..8ec0dd8189 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp @@ -32,8 +32,7 @@ using namespace KokkosBatched; namespace Test { namespace Gesv { -template +template struct Functor_TestBatchedSerialGesv { using execution_space = typename DeviceType::execution_space; const MatrixType _A; @@ -42,8 +41,7 @@ struct Functor_TestBatchedSerialGesv { const VectorType _B; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialGesv(const MatrixType &A, const MatrixType &tmp, - const VectorType &X, const VectorType &B) + Functor_TestBatchedSerialGesv(const MatrixType &A, const MatrixType &tmp, const VectorType &X, const VectorType &B) : _A(A), _tmp(tmp), _X(X), _B(B) {} KOKKOS_INLINE_FUNCTION @@ -68,21 +66,18 @@ struct Functor_TestBatchedSerialGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::ArithTraits ats; using MagnitudeType = typename Kokkos::ArithTraits::mag_type; - using NormViewType = - Kokkos::View; + using NormViewType = Kokkos::View; NormViewType sqr_norm_j("sqr_norm_j", N); auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); - MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize), - tmp("tmp", N, BlkSize, BlkSize + 4); + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize), tmp("tmp", N, BlkSize, BlkSize + 4); VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); create_tridiagonal_batched_matrices(A, B); @@ -98,23 +93,18 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedSerialGesv(A, tmp, X, B) - .run(); + Functor_TestBatchedSerialGesv(A, tmp, X, B).run(); Kokkos::fence(); Kokkos::deep_copy(X_host, X); for (int l = 0; l < N; ++l) - KokkosBlas::SerialGemv:: - invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(X_host, l, Kokkos::ALL), 1, - Kokkos::subview(B_host, l, Kokkos::ALL)); + KokkosBlas::SerialGemv::invoke( + -1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); - KokkosBatched::SerialDot::invoke(B_host, B_host, - sqr_norm_j_host); + KokkosBatched::SerialDot::invoke(B_host, B_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); @@ -127,27 +117,21 @@ template int test_batched_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::Gesv::impl_test_batched_gesv(1024, i); + Test::Gesv::impl_test_batched_gesv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::Gesv::impl_test_batched_gesv(1024, i); + Test::Gesv::impl_test_batched_gesv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index 23ded73e25..6f11154471 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" #include "KokkosBatched_InverseLU_Decl.hpp" -//#include "KokkosBatched_InverseLU_Serial_Impl.hpp" +// #include "KokkosBatched_InverseLU_Serial_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedSerialGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,8 +49,7 @@ struct Functor_BatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -63,8 +61,8 @@ struct Functor_BatchedSerialGemm { for (int i = 0; i < static_cast(aa.extent(0)); ++i) aa(i, i) += 10.0; - SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + SerialGemm::invoke(_alpha, aa, bb, _beta, + cc); } inline void run() { @@ -108,16 +106,14 @@ struct Functor_BatchedSerialLU { } }; -template +template struct Functor_TestBatchedSerialInverseLU { using execution_space = typename DeviceType::execution_space; AViewType _a; WViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialInverseLU(const AViewType &a, const WViewType &w) - : _a(a), _w(w) {} + Functor_TestBatchedSerialInverseLU(const AViewType &a, const WViewType &w) : _a(a), _w(w) {} KOKKOS_INLINE_FUNCTION void operator()(const int k) const { @@ -139,8 +135,7 @@ struct Functor_TestBatchedSerialInverseLU { } }; -template +template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -151,8 +146,7 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { WViewType w("w", N, BlkSize * BlkSize); AViewType c0("c0", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fence(); @@ -162,16 +156,12 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { Functor_BatchedSerialLU(a1).run(); - Functor_TestBatchedSerialInverseLU(a1, w) - .run(); + Functor_TestBatchedSerialInverseLU(a1, w).run(); value_type alpha = 1.0, beta = 0.0; - typedef SerialInverseLU::ParamTag - param_tag_type; + typedef SerialInverseLU::ParamTag param_tag_type; - Functor_BatchedSerialGemm(alpha, a0, a1, beta, c0) + Functor_BatchedSerialGemm(alpha, a0, a1, beta, c0) .run(); Kokkos::fence(); @@ -202,31 +192,21 @@ template int test_batched_inverselu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::SerialInverseLU::impl_test_batched_inverselu( - 0, 10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::SerialInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::SerialInverseLU::impl_test_batched_inverselu< - DeviceType, AViewType, WViewType, AlgoTagType>(1024, i); + Test::SerialInverseLU::impl_test_batched_inverselu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::SerialInverseLU::impl_test_batched_inverselu( - 0, 10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::SerialInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::SerialInverseLU::impl_test_batched_inverselu< - DeviceType, AViewType, WViewType, AlgoTagType>(1024, i); + Test::SerialInverseLU::impl_test_batched_inverselu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp index 243ed21908..01e6372471 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_serial_inverselu_dcomplex) { // printf("Batched serial inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, - Algo::InverseLU::Unblocked>(); + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched serial inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, - Algo::InverseLU::Blocked>(); + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialLU.hpp b/batched/dense/unit_test/Test_Batched_SerialLU.hpp index 87224aa888..33e079dd9b 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" @@ -67,16 +67,14 @@ void impl_test_batched_lu(const int N, const int BlkSize) { /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fence(); Kokkos::deep_copy(a1, a0); - Functor_TestBatchedSerialLU(a0) - .run(); + Functor_TestBatchedSerialLU(a0).run(); Functor_TestBatchedSerialLU(a1).run(); Kokkos::fence(); @@ -107,8 +105,7 @@ template int test_batched_lu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); @@ -118,8 +115,7 @@ int test_batched_lu() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp index 6ee7818ddc..11274fc311 100644 --- a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp @@ -27,16 +27,14 @@ using namespace KokkosBatched; namespace Test { namespace Pttrf { -template +template struct Functor_BatchedSerialPttrf { using execution_space = typename DeviceType::execution_space; DViewType _d; EViewType _e; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) - : _d(d), _e(e) {} + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) : _d(d), _e(e) {} KOKKOS_INLINE_FUNCTION void operator()(const int k, int &info) const { @@ -60,8 +58,8 @@ struct Functor_BatchedSerialPttrf { } }; -template +template struct Functor_BatchedSerialGemm { using execution_space = typename DeviceType::execution_space; AViewType _a; @@ -70,8 +68,7 @@ struct Functor_BatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, - const BViewType &b, const ScalarType beta, + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, const BViewType &b, const ScalarType beta, const CViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -81,9 +78,7 @@ struct Functor_BatchedSerialGemm { auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, - _beta, cc); + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, _beta, cc); } inline void run() { @@ -96,8 +91,7 @@ struct Functor_BatchedSerialGemm { } }; -template +template /// \brief Implementation details of batched pttrf test for random matrix /// /// \param N [in] Batch size of matrix A @@ -109,16 +103,13 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { using View2DType = Kokkos::View; using View3DType = Kokkos::View; - View3DType A("A", N, BlkSize, BlkSize), - A_reconst("A_reconst", N, BlkSize, BlkSize); - View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), - D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), - L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), D("D", N, BlkSize, BlkSize), + LD("LD", N, BlkSize, BlkSize), L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); RealView2DType d("d", N, BlkSize), // Diagonal components ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); - View2DType e_upper("e_upper", N, BlkSize - 1), - e_lower("e_lower", N, - BlkSize - 1); // upper and lower diagonal components + View2DType e_upper("e_upper", N, BlkSize - 1), e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components using execution_space = typename DeviceType::execution_space; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -129,19 +120,16 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); // Add BlkSize to ensure positive definiteness - Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, - realRandEnd + BlkSize); + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, realRandEnd + BlkSize); Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); - auto h_e_upper = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_upper = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); auto h_e_lower = Kokkos::create_mirror_view(e_lower); for (int ib = 0; ib < N; ib++) { for (int i = 0; i < BlkSize - 1; i++) { // Fill the lower diagonal with conjugate of the upper diagonal - h_e_lower(ib, i) = - Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + h_e_lower(ib, i) = Kokkos::ArithTraits::conj(h_e_upper(ib, i)); } } @@ -157,23 +145,21 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { // Matrix matrix addition by Gemm // D + EU by D * I + EU (result stored in EU) - Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + Functor_BatchedSerialGemm(1.0, D, I, + 1.0, EU) .run(); // Copy EL to A Kokkos::deep_copy(A, EL); // EU + EL by EU * I + A (result stored in A) - Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + Functor_BatchedSerialGemm(1.0, EU, I, + 1.0, A) .run(); // Factorize matrix A -> L * D * L**H // d and e are updated by pttrf - auto info = Functor_BatchedSerialPttrf(d, e_lower) - .run(); + auto info = Functor_BatchedSerialPttrf(d, e_lower).run(); Kokkos::fence(); @@ -189,14 +175,14 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { Kokkos::deep_copy(L, I); // EL + I by EL * I + L (result stored in L) - Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + Functor_BatchedSerialGemm(1.0, EL, I, + 1.0, L) .run(); // Reconstruct A by L*D*L**H // Gemm to compute L*D -> LD - Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + Functor_BatchedSerialGemm(1.0, L, D, + 0.0, LD) .run(); // FIXME: We should use SerialGemm Trans::ConjTranspose. @@ -222,9 +208,8 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { Kokkos::deep_copy(L, h_L); // Gemm to compute (L*D)*(conj(L))**T -> A_reconst - Functor_BatchedSerialGemm(1.0, LD, L, 0.0, - A_reconst) + Functor_BatchedSerialGemm( + 1.0, LD, L, 0.0, A_reconst) .run(); Kokkos::fence(); @@ -232,9 +217,8 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { // this eps is about 10^-14 RealType eps = 1.0e3 * ats::epsilon(); - auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); - auto h_A_reconst = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); // Check A = L*D*L**H for (int ib = 0; ib < N; ib++) { @@ -246,8 +230,7 @@ void impl_test_batched_pttrf(const int N, const int BlkSize) { } } -template +template /// \brief Implementation details of batched pttrf test for early return /// BlkSize must be 0 or 1 /// @@ -263,8 +246,7 @@ void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { const int BlkSize_minus_1 = BlkSize > 0 ? BlkSize - 1 : 0; - RealView2DType d("d", N, BlkSize), - d2("d2", N, BlkSize); // Diagonal components + RealView2DType d("d", N, BlkSize), d2("d2", N, BlkSize); // Diagonal components View2DType e("e", N, BlkSize_minus_1); // lower diagonal components @@ -277,14 +259,10 @@ void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { // Factorize matrix A -> L * D * L**H // d and e are updated by pttrf // Early return if BlkSize is 0 or 1 - auto info = Functor_BatchedSerialPttrf(d, e) - .run(); + auto info = Functor_BatchedSerialPttrf(d, e).run(); // For negative values, info should be 1 for BlkSize = 1 - auto info2 = Functor_BatchedSerialPttrf(d2, e) - .run(); + auto info2 = Functor_BatchedSerialPttrf(d2, e).run(); Kokkos::fence(); @@ -307,8 +285,7 @@ void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { } } -template +template /// \brief Implementation details of batched pttrf test /// /// \param N [in] Batch size of matrix A @@ -320,11 +297,9 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { using View2DType = Kokkos::View; using View3DType = Kokkos::View; - View3DType A("A", N, BlkSize, BlkSize), - A_reconst("A_reconst", N, BlkSize, BlkSize); - View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), - D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), - L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), D("D", N, BlkSize, BlkSize), + LD("LD", N, BlkSize, BlkSize), L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, BlkSize), // Diagonal components ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); @@ -344,23 +319,21 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { // Matrix matrix addition by Gemm // D + EU by D * I + EU (result stored in EU) - Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + Functor_BatchedSerialGemm(1.0, D, I, + 1.0, EU) .run(); // Copy EL to A Kokkos::deep_copy(A, EL); // EU + EL by EU * I + A (result stored in A) - Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + Functor_BatchedSerialGemm(1.0, EU, I, + 1.0, A) .run(); // Factorize matrix A -> L * D * L**T // d and e are updated by pttrf - auto info = Functor_BatchedSerialPttrf(d, e) - .run(); + auto info = Functor_BatchedSerialPttrf(d, e).run(); Kokkos::fence(); @@ -376,20 +349,19 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { Kokkos::deep_copy(L, I); // EL + I by EL * I + L (result stored in L) - Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + Functor_BatchedSerialGemm(1.0, EL, I, + 1.0, L) .run(); // Reconstruct A by L*D*L**T // Gemm to compute L*D -> LD - Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + Functor_BatchedSerialGemm(1.0, L, D, + 0.0, LD) .run(); // Gemm to compute (L*D)*L**T -> A_reconst - Functor_BatchedSerialGemm(1.0, LD, L, 0.0, - A_reconst) + Functor_BatchedSerialGemm( + 1.0, LD, L, 0.0, A_reconst) .run(); Kokkos::fence(); @@ -397,9 +369,8 @@ void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { // this eps is about 10^-14 RealType eps = 1.0e3 * ats::epsilon(); - auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); - auto h_A_reconst = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); // Check A = L*D*L.T for (int ib = 0; ib < N; ib++) { @@ -420,22 +391,14 @@ int test_batched_pttrf() { { using LayoutType = Kokkos::LayoutLeft; for (int i = 0; i < 2; i++) { - Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); - Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(2, i); } for (int i = 2; i < 10; i++) { - Test::Pttrf::impl_test_batched_pttrf(1, i); - Test::Pttrf::impl_test_batched_pttrf(2, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 1, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 2, i); + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(2, i); } } #endif @@ -443,22 +406,14 @@ int test_batched_pttrf() { { using LayoutType = Kokkos::LayoutRight; for (int i = 0; i < 2; i++) { - Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); - Test::Pttrf::impl_test_batched_pttrf_quick_return< - DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return(2, i); } for (int i = 2; i < 10; i++) { - Test::Pttrf::impl_test_batched_pttrf(1, i); - Test::Pttrf::impl_test_batched_pttrf(2, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 1, i); - Test::Pttrf::impl_test_batched_pttrf_analytical( - 2, i); + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical(2, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index 099fa9219f..9bf9d43578 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -70,8 +70,7 @@ void verifyOrthogonal(const Mat& X) { } template -void verifySVD(const AView& A, const UView& U, const VtView& Vt, - const SigmaView& sigma) { +void verifySVD(const AView& A, const UView& U, const VtView& Vt, const SigmaView& sigma) { using Scalar = typename AView::non_const_value_type; using KAT = Kokkos::ArithTraits; // Check that U/V columns are unit length and orthogonal, and that U * @@ -85,10 +84,8 @@ void verifySVD(const AView& A, const UView& U, const VtView& Vt, verifyOrthogonal(Vt); Kokkos::View usvt("USV^T", m, n); for (int i = 0; i < maxrank; i++) { - auto Ucol = - Kokkos::subview(U, Kokkos::ALL(), Kokkos::make_pair(i, i + 1)); - auto Vtrow = - Kokkos::subview(Vt, Kokkos::make_pair(i, i + 1), Kokkos::ALL()); + auto Ucol = Kokkos::subview(U, Kokkos::ALL(), Kokkos::make_pair(i, i + 1)); + auto Vtrow = Kokkos::subview(Vt, Kokkos::make_pair(i, i + 1), Kokkos::ALL()); Test::vanillaGEMM(sigma(i), Ucol, Vtrow, 1.0, usvt); } for (int i = 0; i < m; i++) { @@ -113,8 +110,7 @@ Matrix createRandomMatrix(int m, int n, int deficiency, double maxval = 1.0) { auto mhost = Kokkos::create_mirror_view(mat); // Fill mat with random values first if (maxval != 0.0) { - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Scalar minrand, maxrand; Test::getRandomBounds(maxval, minrand, maxrand); Kokkos::fill_random(mhost, rand_pool, minrand, maxrand); @@ -143,15 +139,14 @@ Matrix createRandomMatrix(int m, int n, int deficiency, double maxval = 1.0) { template struct SerialSVDFunctor_Full { - SerialSVDFunctor_Full(const Matrix& A_, const Matrix& U_, const Matrix& Vt_, - const Vector& sigma_, const Vector& work_) + SerialSVDFunctor_Full(const Matrix& A_, const Matrix& U_, const Matrix& Vt_, const Vector& sigma_, + const Vector& work_) : A(A_), U(U_), Vt(Vt_), sigma(sigma_), work(work_) {} // NOTE: this functor is only meant to be launched with a single element range // policy KOKKOS_INLINE_FUNCTION void operator()(int) const { - KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag(), A, U, sigma, - Vt, work); + KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag(), A, U, sigma, Vt, work); } Matrix A; @@ -163,15 +158,13 @@ struct SerialSVDFunctor_Full { template struct SerialSVDFunctor_SingularValuesOnly { - SerialSVDFunctor_SingularValuesOnly(const Matrix& A_, const Vector& sigma_, - const Vector& work_) + SerialSVDFunctor_SingularValuesOnly(const Matrix& A_, const Vector& sigma_, const Vector& work_) : A(A_), sigma(sigma_), work(work_) {} // NOTE: this functor is only meant to be launched with a single element range // policy KOKKOS_INLINE_FUNCTION void operator()(int) const { - KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_S_Tag(), A, sigma, - work); + KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_S_Tag(), A, sigma, work); } Matrix A; @@ -201,14 +194,12 @@ void testSerialSVD(int m, int n, int deficiency, double maxval = 1.0) { typename Matrix::HostMirror Acopy("Acopy", m, n); Kokkos::deep_copy(Acopy, A); // Run the SVD - Kokkos::parallel_for( - Kokkos::RangePolicy(0, 1), - SerialSVDFunctor_Full(A, U, Vt, sigma, work)); + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + SerialSVDFunctor_Full(A, U, Vt, sigma, work)); // Get the results back - auto Uhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U); - auto Vthost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Vt); - auto sigmaHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma); + auto Uhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U); + auto Vthost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Vt); + auto sigmaHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma); // Verify the SVD is correct verifySVD(Acopy, Uhost, Vthost, sigmaHost); } @@ -237,22 +228,17 @@ void testSerialSVDSingularValuesOnly(int m, int n) { typename Matrix::HostMirror Acopy("Acopy", m, n); Kokkos::deep_copy(Acopy, A); // Run the SVD (full mode) - Kokkos::parallel_for( - Kokkos::RangePolicy(0, 1), - SerialSVDFunctor_Full(A, U, Vt, sigma1, work)); + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + SerialSVDFunctor_Full(A, U, Vt, sigma1, work)); Kokkos::deep_copy(A, Acopy); // Run the same SVD (singular values only mode) - Kokkos::parallel_for( - Kokkos::RangePolicy(0, 1), - SerialSVDFunctor_SingularValuesOnly(A, sigma2, work)); - auto sigma1Host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma1); - auto sigma2Host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma2); + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + SerialSVDFunctor_SingularValuesOnly(A, sigma2, work)); + auto sigma1Host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma1); + auto sigma2Host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma2); // Make sure they match for (int i = 0; i < maxrank; i++) { - Test::EXPECT_NEAR_KK(sigma1Host(i), sigma2Host(i), - Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(sigma1Host(i), sigma2Host(i), Test::svdEpsilon()); } } @@ -279,9 +265,8 @@ void testSerialSVDZeroLastRow(int n) { Matrix BVt("UBVt", n, n); Test::vanillaGEMM(1.0, B, Vt, 0.0, BVt); // Run the routine (just on host) - KokkosBatched::SerialSVDInternal::svdZeroLastColumn( - B.data(), n, B.stride(0), B.stride(1), Vt.data(), Vt.stride(0), - Vt.stride(1)); + KokkosBatched::SerialSVDInternal::svdZeroLastColumn(B.data(), n, B.stride(0), B.stride(1), Vt.data(), + Vt.stride(0), Vt.stride(1)); // Check that B is still bidiagonal (to a tight tolerance, but not exactly // zero) for (int i = 0; i < n; i++) { @@ -292,8 +277,7 @@ void testSerialSVDZeroLastRow(int n) { } } // Check that the last superdiagonal is now zero - Test::EXPECT_NEAR_KK(B(n - 2, n - 1), KAT::zero(), - Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(B(n - 2, n - 1), KAT::zero(), Test::svdEpsilon()); // Check that the product is still maintained Matrix BVt2("UBVt", n, n); Test::vanillaGEMM(1.0, B, Vt, 0.0, BVt2); @@ -312,8 +296,8 @@ void testSerialSVDZeroDiagonal(int n, int row) { // Generate a bidiagonal matrix using Matrix = Kokkos::View; using KAT = Kokkos::ArithTraits; - int m = n + 2; // Make U somewhat bigger to make sure the Givens transforms - // are applied correctly + int m = n + 2; // Make U somewhat bigger to make sure the Givens transforms + // are applied correctly Matrix B = createRandomMatrix(m, n, 0, 1.0); // Zero out entries to make B bidiagonal for (int i = 0; i < m; i++) { @@ -331,9 +315,8 @@ void testSerialSVDZeroDiagonal(int n, int row) { Matrix UB("UB", m, n); Test::vanillaGEMM(1.0, U, B, 0.0, UB); // Run the routine (just on host) - KokkosBatched::SerialSVDInternal::svdZeroRow( - row, B.data(), n, B.stride(0), B.stride(1), U.data(), m, U.stride(0), - U.stride(1)); + KokkosBatched::SerialSVDInternal::svdZeroRow(row, B.data(), n, B.stride(0), B.stride(1), U.data(), m, + U.stride(0), U.stride(1)); // Check that B is still bidiagonal (to a tight tolerance, but not exactly // zero) for (int i = 0; i < m; i++) { @@ -381,12 +364,9 @@ void testSVD() { template KOKKOS_INLINE_FUNCTION constexpr auto Determinant(ViewT F) - -> std::enable_if_t::value && ViewT::rank == 2, - double> { - return (F(0, 0) * F(1, 1) * F(2, 2) + F(0, 1) * F(1, 2) * F(2, 0) + - F(0, 2) * F(1, 0) * F(2, 1) - - (F(0, 2) * F(1, 1) * F(2, 0) + F(0, 1) * F(1, 0) * F(2, 2) + - F(0, 0) * F(1, 2) * F(2, 1))); + -> std::enable_if_t::value && ViewT::rank == 2, double> { + return (F(0, 0) * F(1, 1) * F(2, 2) + F(0, 1) * F(1, 2) * F(2, 0) + F(0, 2) * F(1, 0) * F(2, 1) - + (F(0, 2) * F(1, 1) * F(2, 0) + F(0, 1) * F(1, 0) * F(2, 2) + F(0, 0) * F(1, 2) * F(2, 1))); } template @@ -411,39 +391,31 @@ void testIssue1786() { using execution_space = typename Device::execution_space; using memory_space = typename Device::memory_space; constexpr int num_tests = 4; - Kokkos::View matrices("data", - num_tests); + Kokkos::View matrices("data", num_tests); GenerateTestData(matrices); - Kokkos::View Us("Us", - matrices.extent(0)); - Kokkos::View Ss("Ss", matrices.extent(0)); - Kokkos::View Vts("Vts", - matrices.extent(0)); + Kokkos::View Us("Us", matrices.extent(0)); + Kokkos::View Ss("Ss", matrices.extent(0)); + Kokkos::View Vts("Vts", matrices.extent(0)); // Make sure the 2nd dimension of works is contiguous - Kokkos::View works( - "works", matrices.extent(0)); - Kokkos::View matrices_copy( - "matrices_copy", matrices.extent(0)); + Kokkos::View works("works", matrices.extent(0)); + Kokkos::View matrices_copy("matrices_copy", matrices.extent(0)); // make a copy of the input data to avoid overwriting it Kokkos::deep_copy(matrices_copy, matrices); auto policy = Kokkos::RangePolicy(0, matrices.extent(0)); Kokkos::parallel_for( "polar decomposition", policy, KOKKOS_LAMBDA(int i) { - auto matrix_copy = - Kokkos::subview(matrices_copy, i, Kokkos::ALL(), Kokkos::ALL()); - auto U = Kokkos::subview(Us, i, Kokkos::ALL(), Kokkos::ALL()); - auto S = Kokkos::subview(Ss, i, Kokkos::ALL()); - auto Vt = Kokkos::subview(Vts, i, Kokkos::ALL(), Kokkos::ALL()); - auto work = Kokkos::subview(works, i, Kokkos::ALL()); - KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag{}, - matrix_copy, U, S, Vt, work); + auto matrix_copy = Kokkos::subview(matrices_copy, i, Kokkos::ALL(), Kokkos::ALL()); + auto U = Kokkos::subview(Us, i, Kokkos::ALL(), Kokkos::ALL()); + auto S = Kokkos::subview(Ss, i, Kokkos::ALL()); + auto Vt = Kokkos::subview(Vts, i, Kokkos::ALL(), Kokkos::ALL()); + auto work = Kokkos::subview(works, i, Kokkos::ALL()); + KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag{}, matrix_copy, U, S, Vt, work); }); - auto Us_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Us); - auto Ss_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Ss); - auto Vts_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Vts); - auto matrices_h = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, matrices); + auto Us_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Us); + auto Ss_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Ss); + auto Vts_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Vts); + auto matrices_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, matrices); for (int i = 0; i < num_tests; i++) { auto A = Kokkos::subview(matrices_h, i, Kokkos::ALL(), Kokkos::ALL()); auto U = Kokkos::subview(Us_h, i, Kokkos::ALL(), Kokkos::ALL()); diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp index 43cb8fab2f..734eda28bd 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" #include "KokkosBatched_SolveLU_Decl.hpp" -//#include "KokkosBatched_SolveLU_Serial_Impl.hpp" +// #include "KokkosBatched_SolveLU_Serial_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedSerialGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,8 +49,7 @@ struct Functor_BatchedSerialGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedSerialGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} @@ -63,8 +61,8 @@ struct Functor_BatchedSerialGemm { for (int i = 0; i < static_cast(aa.extent(0)); ++i) aa(i, i) += 10.0; - SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + SerialGemm::invoke(_alpha, aa, bb, _beta, + cc); } inline void run() { @@ -108,16 +106,14 @@ struct Functor_BatchedSerialLU { } }; -template +template struct Functor_TestBatchedSerialSolveLU { using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialSolveLU(const ViewType &a, const ViewType &b) - : _a(a), _b(b) {} + Functor_TestBatchedSerialSolveLU(const ViewType &a, const ViewType &b) : _a(a), _b(b) {} KOKKOS_INLINE_FUNCTION void operator()(const int k) const { @@ -152,8 +148,7 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { // ViewType a0_T("a0_T", N, BlkSize, BlkSize); // ViewType b_T ("b_T", N, BlkSize, 5 ); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(x0, random, value_type(1.0)); @@ -165,15 +160,12 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { value_type alpha = 1.0, beta = 0.0; typedef ParamTag param_tag_type; - Functor_BatchedSerialGemm(alpha, a0, x0, beta, b) + Functor_BatchedSerialGemm(alpha, a0, x0, beta, b) .run(); Functor_BatchedSerialLU(a1).run(); - Functor_TestBatchedSerialSolveLU(a1, b) - .run(); + Functor_TestBatchedSerialSolveLU(a1, b).run(); Kokkos::fence(); @@ -230,25 +222,19 @@ template int test_batched_solvelu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i < 10; ++i) { - Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); + Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::SerialSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i < 10; ++i) { - Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); + Test::SerialSolveLU::impl_test_batched_solvelu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp index 6eaf9ca5aa..66a99e28d2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_serial_solvelu_dcomplex) { // printf("Batched serial solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_solvelu, - Algo::SolveLU::Unblocked>(); + test_batched_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched serial solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_solvelu, - Algo::SolveLU::Blocked>(); + test_batched_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp index 572e02053b..cd52235dd6 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp @@ -34,8 +34,8 @@ struct ParamTag { using diag = D; }; -template +template struct Functor_BatchedSerialTrsv { using execution_space = typename DeviceType::execution_space; AViewType _a; @@ -44,8 +44,7 @@ struct Functor_BatchedSerialTrsv { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialTrsv(const ScalarType alpha, const AViewType &a, - const BViewType &b) + Functor_BatchedSerialTrsv(const ScalarType alpha, const AViewType &a, const BViewType &b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -53,9 +52,8 @@ struct Functor_BatchedSerialTrsv { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); - KokkosBatched::SerialTrsv< - typename ParamTagType::uplo, typename ParamTagType::trans, - typename ParamTagType::diag, AlgoTagType>::invoke(_alpha, aa, bb); + KokkosBatched::SerialTrsv::invoke(_alpha, aa, bb); } inline void run() { @@ -68,8 +66,7 @@ struct Functor_BatchedSerialTrsv { } }; -template +template struct Functor_BatchedSerialTbsv { using execution_space = typename DeviceType::execution_space; AViewType _a; @@ -77,17 +74,15 @@ struct Functor_BatchedSerialTbsv { int _k; KOKKOS_INLINE_FUNCTION - Functor_BatchedSerialTbsv(const AViewType &a, const BViewType &b, const int k) - : _a(a), _b(b), _k(k) {} + Functor_BatchedSerialTbsv(const AViewType &a, const BViewType &b, const int k) : _a(a), _b(b), _k(k) {} KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const int k) const { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); - KokkosBatched::SerialTbsv< - typename ParamTagType::uplo, typename ParamTagType::trans, - typename ParamTagType::diag, AlgoTagType>::invoke(aa, bb, _k); + KokkosBatched::SerialTbsv::invoke(aa, bb, _k); } inline void run() { @@ -102,8 +97,7 @@ struct Functor_BatchedSerialTbsv { } }; -template +template /// \brief Implementation details of batched tbsv test /// /// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) @@ -111,8 +105,8 @@ template ; - using View3DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; // Reference is created by trsv from triangular matrix View3DType A("A", N, BlkSize, BlkSize), Ref("Ref", N, BlkSize, BlkSize); @@ -128,22 +122,16 @@ void impl_test_batched_tbsv(const int N, const int k, const int BlkSize) { Kokkos::deep_copy(x1, x0); // Create triangluar or banded matrix - create_banded_triangular_matrix(Ref, A, k, - false); - create_banded_triangular_matrix(Ref, Ab, k, - true); + create_banded_triangular_matrix(Ref, A, k, false); + create_banded_triangular_matrix(Ref, Ab, k, true); // Reference trsv - Functor_BatchedSerialTrsv(1.0, A, x0) + Functor_BatchedSerialTrsv(1.0, A, + x0) .run(); // tbsv - Functor_BatchedSerialTbsv(Ab, x1, k) - .run(); + Functor_BatchedSerialTbsv(Ab, x1, k).run(); Kokkos::fence(); @@ -162,17 +150,15 @@ void impl_test_batched_tbsv(const int N, const int k, const int BlkSize) { } } -template +template /// \brief Implementation details of batched tbsv test /// /// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) void impl_test_batched_tbsv_analytical(const std::size_t N) { - using execution_space = typename DeviceType::execution_space; - using View2DType = Kokkos::View; - using StridedView2DType = - Kokkos::View; - using View3DType = Kokkos::View; + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using View3DType = Kokkos::View; // Reference is created by trsv from triangular matrix constexpr std::size_t BlkSize = 3, k = 2, incx = 2; @@ -187,8 +173,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { Kokkos::RangePolicy policy(0, N); Kokkos::parallel_for( - "KokkosBatched::Test::SerialTbsv::Initialize", policy, - KOKKOS_LAMBDA(const std::size_t ib) { + "KokkosBatched::Test::SerialTbsv::Initialize", policy, KOKKOS_LAMBDA(const std::size_t ib) { for (std::size_t i = 0; i < BlkSize; i++) { for (std::size_t j = 0; j < BlkSize; j++) { ref(ib, i, j) = i + 1; @@ -199,10 +184,8 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { x1(ib, j) = 1; } - if (std::is_same_v) { - if (std::is_same_v) { + if (std::is_same_v) { + if (std::is_same_v) { if (std::is_same_v) { x_ref(ib, 0) = 1.0 / 2.0; x_ref(ib, 1) = 1.0 / 6.0; @@ -224,8 +207,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { } } } else { - if (std::is_same_v) { + if (std::is_same_v) { if (std::is_same_v) { x_ref(ib, 0) = 1.0; x_ref(ib, 1) = -1.0 / 2.0; @@ -252,22 +234,14 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { Kokkos::fence(); // Create triangluar or banded matrix - create_banded_triangular_matrix(ref, A, k, - false); - create_banded_triangular_matrix(ref, Ab, k, - true); + create_banded_triangular_matrix(ref, A, k, false); + create_banded_triangular_matrix(ref, Ab, k, true); // tbsv - Functor_BatchedSerialTbsv(Ab, x0, k) - .run(); + Functor_BatchedSerialTbsv(Ab, x0, k).run(); // tbsv with incx == 2 - Functor_BatchedSerialTbsv(Ab, x1, k) - .run(); + Functor_BatchedSerialTbsv(Ab, x1, k).run(); Kokkos::fence(); @@ -280,8 +254,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { // Pack x1 into x0 for contiguous storage Kokkos::parallel_for( - "KokkosBatched::Test::SerialTbsv::Copy", policy, - KOKKOS_LAMBDA(const std::size_t ib) { + "KokkosBatched::Test::SerialTbsv::Copy", policy, KOKKOS_LAMBDA(const std::size_t ib) { for (std::size_t j = 0; j < BlkSize; j++) { x0(ib, j) = x1(ib, j); } @@ -295,8 +268,7 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { using mag_type = typename ats::mag_type; mag_type eps = 1.0e3 * ats::epsilon(); - auto h_x_ref = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); + auto h_x_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); for (std::size_t ib = 0; ib < N; ib++) { for (std::size_t j = 0; j < BlkSize; j++) { // Check x0 = x_ref @@ -311,36 +283,27 @@ void impl_test_batched_tbsv_analytical(const std::size_t N) { } // namespace Tbsv } // namespace Test -template +template int test_batched_tbsv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { using LayoutType = Kokkos::LayoutLeft; - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); - Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + Test::Tbsv::impl_test_batched_tbsv_analytical(0); + Test::Tbsv::impl_test_batched_tbsv_analytical(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); for (int i = 0; i < 10; i++) { - Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { using LayoutType = Kokkos::LayoutRight; - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); - Test::Tbsv::impl_test_batched_tbsv_analytical< - DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); - Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + Test::Tbsv::impl_test_batched_tbsv_analytical(0); + Test::Tbsv::impl_test_batched_tbsv_analytical(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); for (int i = 0; i < 10; i++) { - Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp index 8789cc6931..005a6e92c0 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp @@ -17,69 +17,53 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } // TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_t_u_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_l_t_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_u_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_n_dcomplex) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; - test_batched_tbsv, param_tag_type, - algo_tag_type>(); + test_batched_tbsv, param_tag_type, algo_tag_type>(); } /* [FIXME] These tests need Trans::ConjTranspose in trsv. diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp index 8915b4ad05..c8f10adf5c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp @@ -17,59 +17,51 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) // NO TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } // TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_t_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_t_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_u_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_n_float) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } @@ -78,59 +70,51 @@ TEST_F(TestCategory, batched_serial_tbsv_u_t_n_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } // TRANSPOSE TEST_F(TestCategory, batched_serial_tbsv_l_t_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_l_t_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_u_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } TEST_F(TestCategory, batched_serial_tbsv_u_t_n_double) { - using param_tag_type = - ::Test::Tbsv::ParamTag; - using algo_tag_type = typename Algo::Tbsv::Unblocked; + using param_tag_type = ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; test_batched_tbsv(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp index 7a7e89ebf8..610f9e700a 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp @@ -49,8 +49,7 @@ struct NonUnitDiagTRMM { KOKKOS_INLINE_FUNCTION void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -67,12 +66,9 @@ struct VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -110,8 +106,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrmm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -119,8 +114,7 @@ struct Functor_TestBatchedSerialTrmm { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialTrmm(const ScalarType alpha, const ViewType& a, - const ViewType& b) + Functor_TestBatchedSerialTrmm(const ScalarType alpha, const ViewType& a, const ViewType& b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -128,9 +122,8 @@ struct Functor_TestBatchedSerialTrmm { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(_alpha, aa, bb); + SerialTrmm::invoke(_alpha, aa, bb); } inline void run() { @@ -145,10 +138,8 @@ struct Functor_TestBatchedSerialTrmm { } }; -template -void impl_test_batched_trmm(const int N, const int nRows, const int nCols, - const char* trans) { +template +void impl_test_batched_trmm(const int N, const int nRows, const int nCols, const char* trans) { typedef typename ViewType::value_type value_type; typedef typename DeviceType::execution_space execution_space; typedef Kokkos::ArithTraits ats; @@ -156,56 +147,40 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, ScalarType alpha(1.0); ScalarType beta(0.0); - const bool is_side_right = - std::is_same::value; - const bool is_A_lower = - std::is_same::value; - const int K = is_side_right ? nCols : nRows; - ViewType A("A", N, K, K), B_actual("B_actual", N, nRows, nCols), - B_expected("B_expected", N, nRows, nCols); - typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); - typename ViewType::HostMirror B_actual_host = - Kokkos::create_mirror_view(B_actual); - typename ViewType::HostMirror B_expected_host = - Kokkos::create_mirror_view(B_expected); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); - - using ViewTypeSubA = - decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); - using ViewTypeSubB = - decltype(Kokkos::subview(B_actual, 0, Kokkos::ALL(), Kokkos::ALL())); + const bool is_side_right = std::is_same::value; + const bool is_A_lower = std::is_same::value; + const int K = is_side_right ? nCols : nRows; + ViewType A("A", N, K, K), B_actual("B_actual", N, nRows, nCols), B_expected("B_expected", N, nRows, nCols); + typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror B_actual_host = Kokkos::create_mirror_view(B_actual); + typename ViewType::HostMirror B_expected_host = Kokkos::create_mirror_view(B_expected); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + + using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); + using ViewTypeSubB = decltype(Kokkos::subview(B_actual, 0, Kokkos::ALL(), Kokkos::ALL())); Kokkos::Random_XorShift64_Pool rand_pool(seed); if (std::is_same::value) { // Initialize A with deterministic random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = UnitDiagTRMM; for (int k = 0; k < N; ++k) { functor_type udtrmm(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", - Kokkos::RangePolicy(0, K), udtrmm); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", Kokkos::RangePolicy(0, K), udtrmm); } } else { //(diag[0]=='N')||(diag[0]=='n') // Initialize A with random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = NonUnitDiagTRMM; for (int k = 0; k < N; ++k) { functor_type nudtrmm(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", - Kokkos::RangePolicy(0, K), nudtrmm); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", Kokkos::RangePolicy(0, K), nudtrmm); } } - Kokkos::fill_random(B_actual, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(B_actual, rand_pool, Kokkos::rand, ScalarType>::max()); Kokkos::fence(); Kokkos::deep_copy(B_expected, B_actual); @@ -227,9 +202,7 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, if (!is_side_right) { // B_expected = alpha * op(A) * B + beta * C = 1 * op(A) * B + 0 * C - struct VanillaGEMM - vgemm; + struct VanillaGEMM vgemm; vgemm.A_t = (trans[0] != 'N') && (trans[0] != 'n'); vgemm.B_t = false; vgemm.A_c = (trans[0] == 'C') || (trans[0] == 'c'); @@ -244,15 +217,12 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, ; vgemm.C = Kokkos::subview(B_expected, i, Kokkos::ALL(), Kokkos::ALL()); ; - Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", - Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); + Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", + Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); } } else { // B_expected = alpha * B * op(A) + beta * C = 1 * B * op(A) + 0 * C - struct VanillaGEMM - vgemm; + struct VanillaGEMM vgemm; vgemm.A_t = false; vgemm.B_t = (trans[0] != 'N') && (trans[0] != 'n'); vgemm.A_c = false; @@ -267,14 +237,13 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, ; vgemm.C = Kokkos::subview(B_expected, i, Kokkos::ALL(), Kokkos::ALL()); ; - Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", - Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); + Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", + Kokkos::TeamPolicy(nRows, Kokkos::AUTO, 16), vgemm); } } - Functor_TestBatchedSerialTrmm(alpha, A, B_actual) + Functor_TestBatchedSerialTrmm(alpha, A, + B_actual) .run(); Kokkos::fence(); @@ -308,50 +277,35 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, } // namespace Trmm } // namespace Test -template +template int test_batched_trmm(int batchSize = 512) { - char trans = - std::is_same::value - ? 'N' - : std::is_same::value - ? 'T' - : std::is_same::value - ? 'C' - : 'E'; + char trans = std::is_same::value ? 'N' + : std::is_same::value ? 'T' + : std::is_same::value ? 'C' + : 'E'; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - Test::Trmm::impl_test_batched_trmm(0, 10, 4, - &trans); + Test::Trmm::impl_test_batched_trmm(0, 10, 4, &trans); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 4, &trans); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 1, &trans); + Test::Trmm::impl_test_batched_trmm(batchSize, i, 4, + &trans); + Test::Trmm::impl_test_batched_trmm(batchSize, i, 1, + &trans); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Trmm::impl_test_batched_trmm(0, 10, 4, - &trans); + typedef Kokkos::View ViewType; + Test::Trmm::impl_test_batched_trmm(0, 10, 4, &trans); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 4, &trans); - Test::Trmm::impl_test_batched_trmm( - batchSize, i, 1, &trans); + Test::Trmm::impl_test_batched_trmm(batchSize, i, 4, + &trans); + Test::Trmm::impl_test_batched_trmm(batchSize, i, 1, + &trans); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp index 8ab6e2810c..2d9eab7c4c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp @@ -17,353 +17,227 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_scomplex_scomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp index 1cfc259dd3..10a4f38ed2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp @@ -17,147 +17,111 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; test_batched_trmm(); @@ -167,167 +131,113 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) // NO TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp index f9418a804a..62f4b4de69 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp @@ -37,8 +37,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrsm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -46,8 +45,7 @@ struct Functor_TestBatchedSerialTrsm { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialTrsm(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedSerialTrsm(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -55,9 +53,8 @@ struct Functor_TestBatchedSerialTrsm { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - SerialTrsm::invoke(_alpha, aa, bb); + SerialTrsm::invoke(_alpha, aa, bb); } inline void run() { @@ -72,8 +69,7 @@ struct Functor_TestBatchedSerialTrsm { } }; -template +template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -81,15 +77,13 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { /// randomized input testing views ScalarType alpha(1.0); - const bool is_side_right = - std::is_same::value; - const int b_nrows = is_side_right ? NumCols : BlkSize; - const int b_ncols = is_side_right ? BlkSize : NumCols; - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, b_nrows, b_ncols), b1("b1", N, b_nrows, b_ncols); + const bool is_side_right = std::is_same::value; + const int b_nrows = is_side_right ? NumCols : BlkSize; + const int b_ncols = is_side_right ? BlkSize : NumCols; + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, b_nrows, b_ncols), + b1("b1", N, b_nrows, b_ncols); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -98,12 +92,9 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedSerialTrsm(alpha, a0, b0) - .run(); - Functor_TestBatchedSerialTrsm(alpha, a1, b1) + Functor_TestBatchedSerialTrsm(alpha, a0, b0) .run(); + Functor_TestBatchedSerialTrsm(alpha, a1, b1).run(); Kokkos::fence(); @@ -130,36 +121,27 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { } // namespace Trsm } // namespace Test -template +template int test_batched_trsm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::Trsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::Trsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Trsm::impl_test_batched_trsm(1024, i, 4); - Test::Trsm::impl_test_batched_trsm(1024, i, 1); + Test::Trsm::impl_test_batched_trsm(1024, i, 4); + Test::Trsm::impl_test_batched_trsm(1024, i, 1); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Trsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::Trsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Trsm::impl_test_batched_trsm(1024, i, 4); - Test::Trsm::impl_test_batched_trsm(1024, i, 1); + Test::Trsm::impl_test_batched_trsm(1024, i, 4); + Test::Trsm::impl_test_batched_trsm(1024, i, 1); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp index be0005a74c..d034ba1a53 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp @@ -16,28 +16,19 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_dcomplex ) // { @@ -47,45 +38,30 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { // test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_dcomplex ) // { @@ -96,28 +72,19 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { // } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_double ) { // typedef @@ -126,45 +93,30 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_double) { // test_batched_trsm,double,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, param_tag_type, - algo_tag_type>(); + test_batched_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_double ) { // typedef diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp index 18b10a81e6..44cb802263 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp @@ -16,73 +16,53 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_float_float) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } @@ -90,84 +70,54 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_double_double) { - typedef ::Test::Trmm::ParamTag - param_tag_type; + typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp index 512dce3bce..c0a7de9e99 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp @@ -22,7 +22,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Trsv_Decl.hpp" -//#include "KokkosKernels_TestUtils.hpp" +// #include "KokkosKernels_TestUtils.hpp" using namespace KokkosBatched; @@ -36,8 +36,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrsv { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -45,8 +44,7 @@ struct Functor_TestBatchedSerialTrsv { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialTrsv(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedSerialTrsv(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} KOKKOS_INLINE_FUNCTION @@ -54,9 +52,8 @@ struct Functor_TestBatchedSerialTrsv { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), 0); - SerialTrsv::invoke(_alpha, aa, - bb); + SerialTrsv::invoke(_alpha, aa, bb); } inline void run() { @@ -71,8 +68,7 @@ struct Functor_TestBatchedSerialTrsv { } }; -template +template void impl_test_batched_trsv(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -80,11 +76,10 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { /// randomized input testing views ScalarType alpha(1.5); - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, BlkSize, 1), b1("b1", N, BlkSize, 1); + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, BlkSize, 1), + b1("b1", N, BlkSize, 1); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -95,12 +90,9 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedSerialTrsv(alpha, a0, b0) - .run(); - Functor_TestBatchedSerialTrsv(alpha, a1, b1) + Functor_TestBatchedSerialTrsv(alpha, a0, b0) .run(); + Functor_TestBatchedSerialTrsv(alpha, a1, b1).run(); Kokkos::fence(); @@ -120,16 +112,14 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { /// check b0 and b1 are correct const value_type one(1); - const bool is_unit_diag = - std::is_same::value; + const bool is_unit_diag = std::is_same::value; for (int k = 0; k < N; ++k) { if (std::is_same::value) { if (std::is_same::value) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = 0; j <= i; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? one : a0_host(k, i, j)); const value_type bval = b0_host(k, j, 0); tmp += aval * bval; } @@ -138,20 +128,17 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = 0; j <= i; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? one : a0_host(k, i, j)); const value_type bval = b1_host(k, j, 0); tmp += aval * bval; } EXPECT_NEAR(ats::abs(tmp), ats::abs(alpha), eps); } - } else if (std::is_same::value) { + } else if (std::is_same::value) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = i; j < BlkSize; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? one : a0_host(k, i, j)); const value_type bval = b0_host(k, j, 0); tmp += aval * bval; } @@ -160,8 +147,7 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { for (int i = 0; i < BlkSize; ++i) { value_type tmp(0); for (int j = i; j < BlkSize; ++j) { - const value_type aval = - (i == j && is_unit_diag ? one : a0_host(k, i, j)); + const value_type aval = (i == j && is_unit_diag ? one : a0_host(k, i, j)); const value_type bval = b1_host(k, j, 0); tmp += aval * bval; } @@ -183,15 +169,12 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { } // namespace Trsv } // namespace Test -template +template int test_batched_trsv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::Trsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::Trsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d, Uplo %d, Trans %d, Diag // %d\n", @@ -200,17 +183,14 @@ int test_batched_trsv() { // std::is_same::value, std::is_same::value); - Test::Trsv::impl_test_batched_trsv(1, i); + Test::Trsv::impl_test_batched_trsv(1, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Trsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::Trsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d, Uplo %d, Trans %d, Diag // %d\n", @@ -219,8 +199,7 @@ int test_batched_trsv() { // std::is_same::value, std::is_same::value); - Test::Trsv::impl_test_batched_trsv(1, i); + Test::Trsv::impl_test_batched_trsv(1, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp index a524b9f97e..73f0e65ed9 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp @@ -16,60 +16,44 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, param_tag_type, - algo_tag_type>(); + test_batched_trsv, double, param_tag_type, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp index be1bf77b9e..5998232605 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp @@ -16,26 +16,22 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_float_float) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; test_batched_trsv(); } @@ -43,31 +39,23 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_float_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_double_double) { - typedef ::Test::Trsv::ParamTag - param_tag_type; + typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp index b09cadcb7e..c4acbbfafb 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp @@ -51,8 +51,7 @@ struct NonUnitDiagTRTRI { KOKKOS_INLINE_FUNCTION void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -69,12 +68,9 @@ struct VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -110,8 +106,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedSerialTrtri { using execution_space = typename DeviceType::execution_space; ViewType _a; @@ -123,8 +118,7 @@ struct Functor_TestBatchedSerialTrtri { void operator()(const ParamTagType&, const int k) const { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(aa); + SerialTrtri::invoke(aa); } inline void run() { @@ -139,8 +133,7 @@ struct Functor_TestBatchedSerialTrtri { } }; -template +template void impl_test_batched_trtri(const int N, const int K) { typedef typename ViewType::value_type value_type; typedef typename DeviceType::execution_space execution_space; @@ -155,8 +148,7 @@ void impl_test_batched_trtri(const int N, const int K) { bool fail_flag = false; ScalarType cur_check_val; // Either 1 or 0, to check A_I - const bool is_A_lower = - std::is_same::value; + const bool is_A_lower = std::is_same::value; ViewType A("A", N, K, K); ViewType A_original("A_original", N, K, K); ViewType A_I("A_I", N, K, K); @@ -164,39 +156,29 @@ void impl_test_batched_trtri(const int N, const int K) { typename ViewType::HostMirror I_host = Kokkos::create_mirror_view(A_I); typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - using ViewTypeSubA = - decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); + using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); Kokkos::Random_XorShift64_Pool rand_pool(seed); if (std::is_same::value) { // Initialize A with deterministic random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = UnitDiagTRTRI; for (int k = 0; k < N; ++k) { functor_type udtrtri(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRTRI", - Kokkos::RangePolicy(0, K), udtrtri); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRTRI", Kokkos::RangePolicy(0, K), udtrtri); } } else { //(diag[0]=='N')||(diag[0]=='n') // Initialize A with random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarType>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarType>::max()); using functor_type = NonUnitDiagTRTRI; for (int k = 0; k < N; ++k) { - functor_type nudtrtri( - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); + functor_type nudtrtri(Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL())); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRTRI", - Kokkos::RangePolicy(0, K), - nudtrtri); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRTRI", Kokkos::RangePolicy(0, K), nudtrtri); } } Kokkos::fence(); @@ -241,9 +223,7 @@ void impl_test_batched_trtri(const int N, const int K) { } #endif - Functor_TestBatchedSerialTrtri(A) - .run(); + Functor_TestBatchedSerialTrtri(A).run(); #if PRINT_MAT printf("A_original:\n"); @@ -271,8 +251,7 @@ void impl_test_batched_trtri(const int N, const int K) { Kokkos::fence(); - struct VanillaGEMM - vgemm; + struct VanillaGEMM vgemm; vgemm.A_t = false; vgemm.B_t = false; vgemm.A_c = false; @@ -287,9 +266,8 @@ void impl_test_batched_trtri(const int N, const int K) { ; vgemm.C = Kokkos::subview(A_I, i, Kokkos::ALL(), Kokkos::ALL()); ; - Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", - Kokkos::TeamPolicy(K, Kokkos::AUTO, 16), vgemm); + Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", Kokkos::TeamPolicy(K, Kokkos::AUTO, 16), + vgemm); } Kokkos::fence(); @@ -311,8 +289,7 @@ void impl_test_batched_trtri(const int N, const int K) { for (int k = 0; k < N; ++k) { for (int i = 0; i < K; ++i) { for (int j = 0; j < K; ++j) { - cur_check_val = - (i == j) ? ScalarType(1) : ScalarType(0); // ats::abs(host_A(i,j)); + cur_check_val = (i == j) ? ScalarType(1) : ScalarType(0); // ats::abs(host_A(i,j)); if (ats::abs(ats::abs(I_host(k, i, j)) - cur_check_val) > eps) { fail_flag = true; // printf(" Error: eps ( %g ), I_host ( %.15f ) != cur_check_val @@ -329,41 +306,29 @@ void impl_test_batched_trtri(const int N, const int K) { } // namespace Trtri } // namespace Test -template +template int test_batched_trtri(int batchSize = 512) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - Test::Trtri::impl_test_batched_trtri(0, 10); + Test::Trtri::impl_test_batched_trtri(0, 10); // Test::impl_test_batched_trtri( // 1, 2); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::Trtri::impl_test_batched_trtri(0, 10); + typedef Kokkos::View ViewType; + Test::Trtri::impl_test_batched_trtri(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); - Test::Trtri::impl_test_batched_trtri(batchSize, - i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); + Test::Trtri::impl_test_batched_trtri(batchSize, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp index 0d8f2c72a6..ca5575c99f 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp @@ -20,29 +20,25 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, Kokkos::complex, - param_tag_type, algo_tag_type>(128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif @@ -52,32 +48,24 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>(128); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp index 952994d207..66fcd162ab 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp @@ -48,28 +48,24 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp index b43b498607..d33f833146 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp @@ -37,8 +37,7 @@ struct Functor_TestBatchedTeamAxpy { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamAxpy(const alphaViewType &alpha, const ViewType &X, - const ViewType &Y, const int N_team) + Functor_TestBatchedTeamAxpy(const alphaViewType &alpha, const ViewType &X, const ViewType &Y, const int N_team) : _alpha(alpha), _X(X), _Y(Y), _N_team(N_team) {} template @@ -46,16 +45,12 @@ struct Functor_TestBatchedTeamAxpy { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _X.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); KokkosBatched::TeamAxpy::invoke(member, alpha, x, y); } @@ -66,8 +61,7 @@ struct Functor_TestBatchedTeamAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -80,13 +74,11 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename alphaViewType::const_value_type alpha_const_value_type; typedef Kokkos::ArithTraits ats; - ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), - Y1("y1", N, BlkSize); + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); alphaViewType alpha("alpha", N); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X0, random, const_value_type(1.0)); Kokkos::fill_random(Y0, random, const_value_type(1.0)); Kokkos::fill_random(alpha, random, alpha_const_value_type(1.0)); @@ -106,12 +98,9 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(Y0_host, Y0); for (int l = 0; l < N; ++l) - for (int i = 0; i < BlkSize; ++i) - Y0_host(l, i) += alpha_host(l) * X0_host(l, i); + for (int i = 0; i < BlkSize; ++i) Y0_host(l, i) += alpha_host(l) * X0_host(l, i); - Functor_TestBatchedTeamAxpy(alpha, X1, - Y1, N_team) - .run(); + Functor_TestBatchedTeamAxpy(alpha, X1, Y1, N_team).run(); Kokkos::fence(); @@ -140,25 +129,20 @@ int test_batched_team_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp index b95b769fcc..ba47fe739a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp @@ -16,8 +16,7 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_dcomplex) { - test_batched_team_axpy, - Kokkos::complex>(); + test_batched_team_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_double) { diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp index ac458d4a55..1fcbae03d6 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp @@ -15,9 +15,7 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_axpy_nt_float_float) { - test_batched_team_axpy(); -} +TEST_F(TestCategory, batched_scalar_team_axpy_nt_float_float) { test_batched_team_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp index 2d952889c9..f283da2b68 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" @@ -38,8 +38,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_TestBatchedTeamGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -47,24 +46,20 @@ struct Functor_TestBatchedTeamGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_TestBatchedTeamGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, - cc); + KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -74,19 +69,15 @@ struct Functor_TestBatchedTeamGemm { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template -void impl_test_batched_teamgemm(const int N, const int matAdim1, - const int matAdim2, const int matBdim1, - const int matBdim2, const int matCdim1, - const int matCdim2) { +template +void impl_test_batched_teamgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, + const int matBdim2, const int matCdim1, const int matCdim2) { using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; @@ -96,15 +87,11 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); - ViewType a_expected("a_expected", N, matAdim1, matAdim2), - a_actual("a_actual", N, matAdim1, matAdim2), - b_expected("b_expected", N, matBdim1, matBdim2), - b_actual("b_actual", N, matBdim1, matBdim2), - c_expected("c_expected", N, matCdim1, matCdim2), - c_actual("c_actual", N, matCdim1, matCdim2); + ViewType a_expected("a_expected", N, matAdim1, matAdim2), a_actual("a_actual", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b_actual("b_actual", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c_actual("c_actual", N, matCdim1, matCdim2); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a_expected, random, value_type(1.0)); Kokkos::fill_random(b_expected, random, value_type(1.0)); @@ -116,8 +103,7 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, Kokkos::deep_copy(b_actual, b_expected); Kokkos::deep_copy(c_actual, c_expected); - Functor_BatchedVanillaGEMM - vgemm; + Functor_BatchedVanillaGEMM vgemm; vgemm.A_t = std::is_same::value; vgemm.B_t = std::is_same::value; vgemm.A_c = vgemm.B_c = false; @@ -128,17 +114,14 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, vgemm.beta = beta; vgemm.run(); // Compute c_expected - Functor_TestBatchedTeamGemm(alpha, a_actual, b_actual, beta, - c_actual) + Functor_TestBatchedTeamGemm(alpha, a_actual, b_actual, + beta, c_actual) .run(); Kokkos::fence(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host for comparision Kokkos::deep_copy(c_expected_host, c_expected); @@ -166,20 +149,16 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, // void (*impl_test)(const int, const int, const int, const int, const int, // const int, const int) -template +template int test_batched_teamgemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamGemm::impl_test_batched_teamgemm( + typedef Kokkos::View ViewType; + Test::TeamGemm::impl_test_batched_teamgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + Test::TeamGemm::impl_test_batched_teamgemm( 1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { @@ -187,36 +166,24 @@ int test_batched_teamgemm() { int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } @@ -224,15 +191,12 @@ int test_batched_teamgemm() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamGemm::impl_test_batched_teamgemm( + typedef Kokkos::View ViewType; + Test::TeamGemm::impl_test_batched_teamgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + Test::TeamGemm::impl_test_batched_teamgemm( 1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { @@ -240,36 +204,24 @@ int test_batched_teamgemm() { int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamGemm::impl_test_batched_teamgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamGemm::impl_test_batched_teamgemm( 1024, dimK, dimM, dimN, dimK, dimM, dimN); } } diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp index 09c7f3f2cc..a353513967 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp @@ -19,36 +19,24 @@ /// dcomplex, dcomplex TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::TeamGemm::ParamTag @@ -64,32 +52,24 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex) { /// dcomplex, double TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, - param_tag_type, algo_tag_type>(); + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::TeamGemm::ParamTag diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp index b1a5135018..6f06638c2a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp @@ -15,156 +15,116 @@ //@HEADER #if defined(KOKKOS_BHALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_bhalf_bhalf) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT #if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_half_half) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_float_float) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_double_double) { - typedef ::Test::TeamGemm::ParamTag - param_tag_type; + typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); + test_batched_teamgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp index dc3b4e53fb..d119308862 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp @@ -32,8 +32,7 @@ using namespace KokkosBatched; namespace Test { namespace TeamGesv { -template +template struct Functor_TestBatchedTeamGesv { using execution_space = typename DeviceType::execution_space; const MatrixType _A; @@ -41,16 +40,14 @@ struct Functor_TestBatchedTeamGesv { const VectorType _B; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamGesv(const MatrixType &A, const VectorType &X, - const VectorType &B) - : _A(A), _X(X), _B(B) {} + Functor_TestBatchedTeamGesv(const MatrixType &A, const VectorType &X, const VectorType &B) : _A(A), _X(X), _B(B) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int matrix_id = static_cast(member.league_rank()); - auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); - auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); - auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); member.team_barrier(); KokkosBatched::TeamGesv::invoke(member, A, x, b); @@ -63,13 +60,10 @@ struct Functor_TestBatchedTeamGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), Kokkos::AUTO()); - using MatrixViewType = - Kokkos::View; + using MatrixViewType = Kokkos::View; const int n = _A.extent(1); size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); @@ -80,15 +74,13 @@ struct Functor_TestBatchedTeamGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::ArithTraits ats; using MagnitudeType = typename Kokkos::ArithTraits::mag_type; - using NormViewType = - Kokkos::View; + using NormViewType = Kokkos::View; NormViewType sqr_norm_j("sqr_norm_j", N); auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); @@ -109,23 +101,18 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamGesv( - A, X, B) - .run(); + Functor_TestBatchedTeamGesv(A, X, B).run(); Kokkos::fence(); Kokkos::deep_copy(X_host, X); for (int l = 0; l < N; ++l) - KokkosBlas::SerialGemv:: - invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(X_host, l, Kokkos::ALL), 1, - Kokkos::subview(B_host, l, Kokkos::ALL)); + KokkosBlas::SerialGemv::invoke( + -1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); - KokkosBatched::SerialDot::invoke(B_host, B_host, - sqr_norm_j_host); + KokkosBatched::SerialDot::invoke(B_host, B_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); @@ -138,27 +125,21 @@ template int test_batched_team_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamGesv::impl_test_batched_gesv(1024, i); + Test::TeamGesv::impl_test_batched_gesv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamGesv::impl_test_batched_gesv(1024, i); + Test::TeamGesv::impl_test_batched_gesv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp index d0b04ea57c..6fd7241f0b 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv_Real.hpp @@ -15,8 +15,7 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_float) { - test_batched_team_gesv(); + test_batched_team_gesv(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { test_batched_team_gesv(); @@ -25,8 +24,7 @@ TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_double) { - test_batched_team_gesv(); + test_batched_team_gesv(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_double) { test_batched_team_gesv(); diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp index a62e655d02..36d0aae738 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Team_Impl.hpp" #include "KokkosBatched_InverseLU_Decl.hpp" -//#include "KokkosBatched_InverseLU_Team_Impl.hpp" +// #include "KokkosBatched_InverseLU_Team_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedTeamGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,14 +49,12 @@ struct Functor_BatchedTeamGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); @@ -69,10 +66,8 @@ struct Functor_BatchedTeamGemm { } member.team_barrier(); - KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, - cc); + KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -83,8 +78,7 @@ struct Functor_BatchedTeamGemm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -124,15 +118,13 @@ struct Functor_BatchedTeamLU { } }; -template +template struct Functor_TestBatchedTeamInverseLU { AViewType _a; WViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamInverseLU(const AViewType &a, const WViewType &w) - : _a(a), _w(w) {} + Functor_TestBatchedTeamInverseLU(const AViewType &a, const WViewType &w) : _a(a), _w(w) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -140,8 +132,7 @@ struct Functor_TestBatchedTeamInverseLU { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto ww = Kokkos::subview(_w, k, Kokkos::ALL()); - KokkosBatched::TeamInverseLU::invoke(member, aa, - ww); + KokkosBatched::TeamInverseLU::invoke(member, aa, ww); } inline void run() { @@ -158,8 +149,7 @@ struct Functor_TestBatchedTeamInverseLU { } }; -template +template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -170,8 +160,7 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { WViewType w("w", N, BlkSize * BlkSize); AViewType c0("c0", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fence(); @@ -181,15 +170,12 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { Functor_BatchedTeamLU(a1).run(); - Functor_TestBatchedTeamInverseLU(a1, w) - .run(); + Functor_TestBatchedTeamInverseLU(a1, w).run(); value_type alpha = 1.0, beta = 0.0; typedef ParamTag param_tag_type; - Functor_BatchedTeamGemm(alpha, a0, a1, beta, c0) + Functor_BatchedTeamGemm(alpha, a0, a1, beta, c0) .run(); Kokkos::fence(); @@ -220,33 +206,21 @@ template int test_batched_team_inverselu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::TeamInverseLU::impl_test_batched_inverselu( - 0, 10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::TeamInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamInverseLU::impl_test_batched_inverselu( - 1024, i); + Test::TeamInverseLU::impl_test_batched_inverselu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - AViewType; - typedef Kokkos::View - WViewType; - Test::TeamInverseLU::impl_test_batched_inverselu( - 0, 10); + typedef Kokkos::View AViewType; + typedef Kokkos::View WViewType; + Test::TeamInverseLU::impl_test_batched_inverselu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamInverseLU::impl_test_batched_inverselu( - 1024, i); + Test::TeamInverseLU::impl_test_batched_inverselu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp index 7eb918beef..cf670f2fc9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_team_inverselu_dcomplex) { // printf("Batched team inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, - Algo::InverseLU::Unblocked>(); + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched team inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, - Algo::InverseLU::Blocked>(); + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamLU.hpp b/batched/dense/unit_test/Test_Batched_TeamLU.hpp index e20f3a7411..b662c4a365 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Serial_Impl.hpp" @@ -76,16 +76,14 @@ void impl_test_batched_lu(const int N, const int BlkSize) { /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fence(); Kokkos::deep_copy(a1, a0); - Functor_TestBatchedTeamLU(a0) - .run(); + Functor_TestBatchedTeamLU(a0).run(); Functor_TestBatchedTeamLU(a1).run(); Kokkos::fence(); @@ -117,27 +115,21 @@ template int test_batched_team_lu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamLU::impl_test_batched_lu(0, - 10); + typedef Kokkos::View ViewType; + Test::TeamLU::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamLU::impl_test_batched_lu( - 1024, i); + Test::TeamLU::impl_test_batched_lu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamLU::impl_test_batched_lu(0, - 10); + typedef Kokkos::View ViewType; + Test::TeamLU::impl_test_batched_lu(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamLU::impl_test_batched_lu( - 1024, i); + Test::TeamLU::impl_test_batched_lu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp index 445e10132f..61a11e6be7 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp @@ -19,14 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_LU_Decl.hpp" #include "KokkosBatched_LU_Team_Impl.hpp" #include "KokkosBatched_SolveLU_Decl.hpp" -//#include "KokkosBatched_SolveLU_Team_Impl.hpp" +// #include "KokkosBatched_SolveLU_Team_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -41,8 +41,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_BatchedTeamGemm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -50,14 +49,12 @@ struct Functor_BatchedTeamGemm { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_BatchedTeamGemm(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); @@ -69,10 +66,8 @@ struct Functor_BatchedTeamGemm { } member.team_barrier(); - KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, - cc); + KokkosBatched::TeamGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -82,8 +77,7 @@ struct Functor_BatchedTeamGemm { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -120,16 +114,14 @@ struct Functor_BatchedTeamLU { Kokkos::Profiling::popRegion(); } }; -template +template struct Functor_TestBatchedTeamSolveLU { using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamSolveLU(const ViewType &a, const ViewType &b) - : _a(a), _b(b) {} + Functor_TestBatchedTeamSolveLU(const ViewType &a, const ViewType &b) : _a(a), _b(b) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -137,8 +129,7 @@ struct Functor_TestBatchedTeamSolveLU { auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamSolveLU::invoke( - member, aa, bb); + KokkosBatched::TeamSolveLU::invoke(member, aa, bb); } inline void run() { @@ -168,8 +159,7 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { // ViewType a0_T("a0_T", N, BlkSize, BlkSize); // ViewType b_T ("b_T", N, BlkSize, 5 ); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(x0, random, value_type(1.0)); @@ -181,15 +171,11 @@ void impl_test_batched_solvelu(const int N, const int BlkSize) { value_type alpha = 1.0, beta = 0.0; typedef ParamTag param_tag_type; - Functor_BatchedTeamGemm(alpha, a0, x0, beta, b) - .run(); + Functor_BatchedTeamGemm(alpha, a0, x0, beta, b).run(); Functor_BatchedTeamLU(a1).run(); - Functor_TestBatchedTeamSolveLU(a1, b) - .run(); + Functor_TestBatchedTeamSolveLU(a1, b).run(); Kokkos::fence(); @@ -246,25 +232,19 @@ template int test_batched_team_solvelu() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); + Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); + typedef Kokkos::View ViewType; + Test::TeamSolveLU::impl_test_batched_solvelu(0, 10); for (int i = 0; i < 10; ++i) { - Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); + Test::TeamSolveLU::impl_test_batched_solvelu(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp index 865f58ef43..f904983509 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp @@ -18,11 +18,9 @@ TEST_F(TestCategory, batched_scalar_team_solvelu_dcomplex) { // printf("Batched team solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_team_solvelu, - Algo::SolveLU::Unblocked>(); + test_batched_team_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched team solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_team_solvelu, - Algo::SolveLU::Blocked>(); + test_batched_team_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp index 523bd02df4..5ae1e216d9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Trsm_Serial_Impl.hpp" @@ -40,8 +40,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedTeamTrsm { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -49,22 +48,20 @@ struct Functor_TestBatchedTeamTrsm { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamTrsm(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedTeamTrsm(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamTrsm< - MemberType, typename ParamTagType::side, typename ParamTagType::uplo, - typename ParamTagType::trans, typename ParamTagType::diag, - AlgoTagType>::invoke(member, _alpha, aa, bb); + KokkosBatched::TeamTrsm::invoke(member, + _alpha, aa, + bb); } inline void run() { @@ -75,15 +72,13 @@ struct Functor_TestBatchedTeamTrsm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -91,15 +86,13 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { /// randomized input testing views ScalarType alpha(1.0); - const bool is_side_right = - std::is_same::value; - const int b_nrows = is_side_right ? NumCols : BlkSize; - const int b_ncols = is_side_right ? BlkSize : NumCols; - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, b_nrows, b_ncols), b1("b1", N, b_nrows, b_ncols); + const bool is_side_right = std::is_same::value; + const int b_nrows = is_side_right ? NumCols : BlkSize; + const int b_ncols = is_side_right ? BlkSize : NumCols; + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, b_nrows, b_ncols), + b1("b1", N, b_nrows, b_ncols); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -108,12 +101,9 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedTeamTrsm(alpha, a0, b0) - .run(); - Functor_TestBatchedTeamTrsm(alpha, a1, b1) + Functor_TestBatchedTeamTrsm(alpha, a0, b0) .run(); + Functor_TestBatchedTeamTrsm(alpha, a1, b1).run(); Kokkos::fence(); @@ -140,40 +130,27 @@ void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { } // namespace TeamTrsm } // namespace Test -template +template int test_batched_team_trsm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 4); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 1); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 4); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 1); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); + typedef Kokkos::View ViewType; + Test::TeamTrsm::impl_test_batched_trsm(0, 10, 4); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 4); - Test::TeamTrsm::impl_test_batched_trsm(1024, i, - 1); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 4); + Test::TeamTrsm::impl_test_batched_trsm(1024, i, 1); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp index 0cf2761922..cf9cafeb9e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp @@ -16,176 +16,106 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_dcomplex) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, - Kokkos::complex, param_tag_type, - algo_tag_type>(); + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp index 6757617ddd..cd1d2a7211 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp @@ -16,168 +16,108 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_float_float) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } // TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_double_double) { - typedef ::Test::TeamTrsm::ParamTag - param_tag_type; + typedef ::Test::TeamTrsm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); + test_batched_team_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp index 400e35deb8..37e8708bd2 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_Trsv_Serial_Impl.hpp" @@ -38,8 +38,7 @@ struct ParamTag { typedef D diag; }; -template +template struct Functor_TestBatchedTeamTrsv { using execution_space = typename DeviceType::execution_space; ViewType _a, _b; @@ -47,22 +46,18 @@ struct Functor_TestBatchedTeamTrsv { ScalarType _alpha; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamTrsv(const ScalarType alpha, const ViewType &a, - const ViewType &b) + Functor_TestBatchedTeamTrsv(const ScalarType alpha, const ViewType &a, const ViewType &b) : _a(a), _b(b), _alpha(alpha) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), 0); - KokkosBatched::TeamTrsv< - MemberType, typename ParamTagType::uplo, typename ParamTagType::trans, - typename ParamTagType::diag, AlgoTagType>::invoke(member, _alpha, aa, - bb); + KokkosBatched::TeamTrsv::invoke(member, _alpha, aa, bb); } inline void run() { @@ -73,15 +68,13 @@ struct Functor_TestBatchedTeamTrsv { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_trsv(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -89,11 +82,10 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { /// randomized input testing views ScalarType alpha(1.5); - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, BlkSize, 1), b1("b1", N, BlkSize, 1); + ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), b0("b0", N, BlkSize, 1), + b1("b1", N, BlkSize, 1); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a0, random, value_type(1.0)); Kokkos::fill_random(b0, random, value_type(1.0)); @@ -104,12 +96,9 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { Kokkos::deep_copy(a1, a0); Kokkos::deep_copy(b1, b0); - Functor_TestBatchedTeamTrsv(alpha, a0, b0) - .run(); - Functor_TestBatchedTeamTrsv(alpha, a1, b1) + Functor_TestBatchedTeamTrsv(alpha, a0, b0) .run(); + Functor_TestBatchedTeamTrsv(alpha, a1, b1).run(); Kokkos::fence(); @@ -136,34 +125,25 @@ void impl_test_batched_trsv(const int N, const int BlkSize) { } // namespace TeamTrsv } // namespace Test -template +template int test_batched_team_trsv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamTrsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::TeamTrsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamTrsv::impl_test_batched_trsv(1024, - i); + Test::TeamTrsv::impl_test_batched_trsv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamTrsv::impl_test_batched_trsv(0, 10); + typedef Kokkos::View ViewType; + Test::TeamTrsv::impl_test_batched_trsv(0, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamTrsv::impl_test_batched_trsv(1024, - i); + Test::TeamTrsv::impl_test_batched_trsv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp index fca0534b4b..cd378745ef 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp @@ -37,9 +37,7 @@ struct Functor_TestBatchedTeamVectorAxpy { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorAxpy(const alphaViewType &alpha, - const ViewType &X, const ViewType &Y, - const int N_team) + Functor_TestBatchedTeamVectorAxpy(const alphaViewType &alpha, const ViewType &X, const ViewType &Y, const int N_team) : _alpha(alpha), _X(X), _Y(Y), _N_team(N_team) {} template @@ -47,16 +45,12 @@ struct Functor_TestBatchedTeamVectorAxpy { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _X.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); KokkosBatched::TeamVectorAxpy::invoke(member, alpha, x, y); } @@ -67,8 +61,7 @@ struct Functor_TestBatchedTeamVectorAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -81,13 +74,11 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename alphaViewType::const_value_type alpha_const_value_type; typedef Kokkos::ArithTraits ats; - ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), - Y1("y1", N, BlkSize); + ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); alphaViewType alpha("alpha", N); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X0, random, const_value_type(1.0)); Kokkos::fill_random(Y0, random, const_value_type(1.0)); Kokkos::fill_random(alpha, random, alpha_const_value_type(1.0)); @@ -107,12 +98,9 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(Y0_host, Y0); for (int l = 0; l < N; ++l) - for (int i = 0; i < BlkSize; ++i) - Y0_host(l, i) += alpha_host(l) * X0_host(l, i); + for (int i = 0; i < BlkSize; ++i) Y0_host(l, i) += alpha_host(l) * X0_host(l, i); - Functor_TestBatchedTeamVectorAxpy( - alpha, X1, Y1, N_team) - .run(); + Functor_TestBatchedTeamVectorAxpy(alpha, X1, Y1, N_team).run(); Kokkos::fence(); @@ -141,25 +129,20 @@ int test_batched_teamvector_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View ViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); + Test::TeamVectorAxpy::impl_test_batched_axpy(1024, i, 2); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp index b1f70a723e..0e8cb013f1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp @@ -16,8 +16,7 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_dcomplex) { - test_batched_teamvector_axpy, - Kokkos::complex>(); + test_batched_teamvector_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_double) { diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp index f2f3bc217d..2ebc10f2e0 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp @@ -33,8 +33,7 @@ struct ParamTag { typedef TB transB; }; -template +template struct Functor_TestBatchedTeamVector { using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; @@ -42,24 +41,20 @@ struct Functor_TestBatchedTeamVector { ScalarType _alpha, _beta; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVector(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, + Functor_TestBatchedTeamVector(const ScalarType alpha, const ViewType &a, const ViewType &b, const ScalarType beta, const ViewType &c) : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int k = member.league_rank(); auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - KokkosBatched::TeamVectorGemm::invoke(member, _alpha, aa, bb, - _beta, cc); + KokkosBatched::TeamVectorGemm::invoke(member, _alpha, aa, bb, _beta, cc); } inline void run() { @@ -69,19 +64,15 @@ struct Functor_TestBatchedTeamVector { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template -void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, - const int matAdim2, const int matBdim1, - const int matBdim2, const int matCdim1, - const int matCdim2) { +template +void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, const int matAdim2, const int matBdim1, + const int matBdim2, const int matCdim1, const int matCdim2) { using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; @@ -91,15 +82,11 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); - ViewType a_expected("a_expected", N, matAdim1, matAdim2), - a_actual("a_actual", N, matAdim1, matAdim2), - b_expected("b_expected", N, matBdim1, matBdim2), - b_actual("b_actual", N, matBdim1, matBdim2), - c_expected("c_expected", N, matCdim1, matCdim2), - c_actual("c_actual", N, matCdim1, matCdim2); + ViewType a_expected("a_expected", N, matAdim1, matAdim2), a_actual("a_actual", N, matAdim1, matAdim2), + b_expected("b_expected", N, matBdim1, matBdim2), b_actual("b_actual", N, matBdim1, matBdim2), + c_expected("c_expected", N, matCdim1, matCdim2), c_actual("c_actual", N, matCdim1, matCdim2); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a_expected, random, value_type(1.0)); Kokkos::fill_random(b_expected, random, value_type(1.0)); @@ -114,8 +101,7 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, // Functor_TestBatchedTeamVector(alpha, a_expected, b_expected, // beta, c_expected).run(); - Functor_BatchedVanillaGEMM - vgemm; + Functor_BatchedVanillaGEMM vgemm; vgemm.A_t = std::is_same::value; vgemm.B_t = std::is_same::value; vgemm.A_c = vgemm.B_c = false; @@ -126,17 +112,14 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, vgemm.beta = beta; vgemm.run(); // Compute c_expected - Functor_TestBatchedTeamVector(alpha, a_actual, b_actual, beta, - c_actual) + Functor_TestBatchedTeamVector(alpha, a_actual, b_actual, + beta, c_actual) .run(); Kokkos::fence(); - typename ViewType::HostMirror c_expected_host = - Kokkos::create_mirror_view(c_expected); - typename ViewType::HostMirror c_actual_host = - Kokkos::create_mirror_view(c_actual); + typename ViewType::HostMirror c_expected_host = Kokkos::create_mirror_view(c_expected); + typename ViewType::HostMirror c_actual_host = Kokkos::create_mirror_view(c_actual); // Copy to host for comparison Kokkos::deep_copy(c_expected_host, c_expected); @@ -165,111 +148,80 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, // void (*impl_test)(const int, const int, const int, const int, const int, // const int, const int) -template +template int test_batched_teamvectorgemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + typedef Kokkos::View ViewType; + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, i, i, i, i, i, i); + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( + typedef Kokkos::View ViewType; + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm( 0, 10, 10, 10, 10, 10, 10); for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, i, i, i, i, i, i); + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, i, i, i, i, i, i); } for (int i = 0; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); int dimM = i; int dimN = 2 * i; int dimK = 3 * i; - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimM, dimK, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimM, dimK, dimN, dimK, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimK, dimN, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimK, dimN, dimM, dimN); } - if ((std::is_same::value) && - (std::is_same::value)) { - Test::TeamVectorGemm::impl_test_batched_teamvectorgemm< - DeviceType, ViewType, ScalarType, ParamTagType, AlgoTagType>( - 1024, dimK, dimM, dimN, dimK, dimM, dimN); + if ((std::is_same::value) && + (std::is_same::value)) { + Test::TeamVectorGemm::impl_test_batched_teamvectorgemm(1024, dimK, dimM, dimN, dimK, dimM, dimN); } } } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp index cc6cbdd511..3d8bd949da 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp @@ -15,80 +15,62 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_scomplex_scomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_dcomplex_dcomplex) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, - Kokkos::complex, param_tag_type, + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp index e96bc1ac5c..74a32c13e9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp @@ -15,152 +15,116 @@ //@HEADER #if defined(KOKKOS_BHALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_bhalf_bhalf) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT #if defined(KOKKOS_HALF_T_IS_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_half_half) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_float_float) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_double_double) { - typedef ::Test::TeamVectorGemm::ParamTag - param_tag_type; + typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp index ddb1a5c40d..dba452da53 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp @@ -32,8 +32,7 @@ using namespace KokkosBatched; namespace Test { namespace TeamVectorGesv { -template +template struct Functor_TestBatchedTeamVectorGesv { using execution_space = typename DeviceType::execution_space; const MatrixType _A; @@ -41,20 +40,18 @@ struct Functor_TestBatchedTeamVectorGesv { const VectorType _B; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGesv(const MatrixType &A, const VectorType &X, - const VectorType &B) + Functor_TestBatchedTeamVectorGesv(const MatrixType &A, const VectorType &X, const VectorType &B) : _A(A), _X(X), _B(B) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int matrix_id = static_cast(member.league_rank()); - auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); - auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); - auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); member.team_barrier(); - KokkosBatched::TeamVectorGesv::invoke(member, A, x, - b); + KokkosBatched::TeamVectorGesv::invoke(member, A, x, b); member.team_barrier(); } @@ -64,13 +61,10 @@ struct Functor_TestBatchedTeamVectorGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), Kokkos::AUTO()); - using MatrixViewType = - Kokkos::View; + using MatrixViewType = Kokkos::View; const int n = _A.extent(1); size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); @@ -81,15 +75,13 @@ struct Functor_TestBatchedTeamVectorGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::ArithTraits ats; using MagnitudeType = typename Kokkos::ArithTraits::mag_type; - using NormViewType = - Kokkos::View; + using NormViewType = Kokkos::View; NormViewType sqr_norm_j("sqr_norm_j", N); auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); @@ -110,23 +102,18 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorGesv(A, X, B) - .run(); + Functor_TestBatchedTeamVectorGesv(A, X, B).run(); Kokkos::fence(); Kokkos::deep_copy(X_host, X); for (int l = 0; l < N; ++l) - KokkosBlas::SerialGemv:: - invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), - Kokkos::subview(X_host, l, Kokkos::ALL), 1, - Kokkos::subview(B_host, l, Kokkos::ALL)); + KokkosBlas::SerialGemv::invoke( + -1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); - KokkosBatched::SerialDot::invoke(B_host, B_host, - sqr_norm_j_host); + KokkosBatched::SerialDot::invoke(B_host, B_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); @@ -139,29 +126,21 @@ template int test_batched_teamvector_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorGesv::impl_test_batched_gesv( - 1024, i); + Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixType; - typedef Kokkos::View - VectorType; + typedef Kokkos::View MatrixType; + typedef Kokkos::View VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorGesv::impl_test_batched_gesv( - 1024, i); + Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp index 66c6fb3691..73a6281fe5 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv_Real.hpp @@ -15,22 +15,18 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_float) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_float) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_double) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_double) { - test_batched_teamvector_gesv(); + test_batched_teamvector_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp index 84ccb39611..2f4812179a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorQR { using execution_space = typename DeviceType::execution_space; MatrixViewType _a; @@ -41,11 +41,8 @@ struct Functor_TestBatchedTeamVectorQR { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorQR(const MatrixViewType &a, - const VectorViewType &x, - const VectorViewType &b, - const VectorViewType &t, - const WorkViewType &w) + Functor_TestBatchedTeamVectorQR(const MatrixViewType &a, const VectorViewType &x, const VectorViewType &b, + const VectorViewType &t, const WorkViewType &w) : _a(a), _x(x), _b(b), _t(t), _w(w) {} template @@ -61,17 +58,15 @@ struct Functor_TestBatchedTeamVectorQR { auto ww = Kokkos::subview(_w, k, Kokkos::ALL()); // make diagonal dominant - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, aa.extent(0)), - [&](const int &i) { aa(i, i) += add_this; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, aa.extent(0)), [&](const int &i) { aa(i, i) += add_this; }); /// xx = 1 KokkosBlas::TeamVectorSet::invoke(member, one, xx); member.team_barrier(); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// AA = QR @@ -83,13 +78,12 @@ struct Functor_TestBatchedTeamVectorQR { member.team_barrier(); /// xx = Q^{T}xx; - TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); + TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); member.team_barrier(); /// xx = R^{-1} xx - TeamVectorTrsv::invoke(member, one, aa, xx); + TeamVectorTrsv::invoke( + member, one, aa, xx); } inline void run() { @@ -107,8 +101,8 @@ struct Functor_TestBatchedTeamVectorQR { } }; -template +template void impl_test_batched_qr(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -122,14 +116,12 @@ void impl_test_batched_qr(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); - Functor_TestBatchedTeamVectorQR(a, x, b, t, w) + Functor_TestBatchedTeamVectorQR(a, x, b, t, w) .run(); Kokkos::fence(); @@ -157,35 +149,25 @@ template int test_batched_qr() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_qr(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_qr(1024, i); + Test::impl_test_batched_qr(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_qr(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_qr(1024, i); + Test::impl_test_batched_qr(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 09427aa25e..f66cebe07d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { using execution_space = typename DeviceType::execution_space; MatrixViewType _a; @@ -42,9 +42,9 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorQR_WithColumnPivoting( - const MatrixViewType &a, const VectorViewType &x, const VectorViewType &b, - const VectorViewType &t, const PivotViewType &p, const WorkViewType &w) + Functor_TestBatchedTeamVectorQR_WithColumnPivoting(const MatrixViewType &a, const VectorViewType &x, + const VectorViewType &b, const VectorViewType &t, + const PivotViewType &p, const WorkViewType &w) : _a(a), _x(x), _b(b), _t(t), _p(p), _w(w) {} template @@ -69,15 +69,13 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { member.team_barrier(); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// AA P^T = QR int matrix_rank(0); - TeamVectorQR_WithColumnPivoting::invoke( - member, aa, tt, pp, ww, matrix_rank); + TeamVectorQR_WithColumnPivoting::invoke(member, aa, tt, pp, ww, matrix_rank); member.team_barrier(); /// xx = bb; @@ -85,25 +83,22 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { member.team_barrier(); /// xx = Q^{T} xx; - TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); + TeamVectorApplyQ::invoke(member, aa, tt, xx, ww); member.team_barrier(); /// xx = R^{-1} xx - TeamVectorTrsv::invoke(member, one, aa, xx); + TeamVectorTrsv::invoke( + member, one, aa, xx); member.team_barrier(); /// xx = P xx - TeamVectorApplyPivot::invoke( - member, pp, xx); + TeamVectorApplyPivot::invoke(member, pp, xx); member.team_barrier(); } inline void run() { typedef typename MatrixViewType::non_const_value_type value_type; - std::string name_region( - "KokkosBatched::Test::TeamVectorQR_WithColumnPivoting"); + std::string name_region("KokkosBatched::Test::TeamVectorQR_WithColumnPivoting"); const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); @@ -116,8 +111,8 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { } }; -template +template void impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -132,15 +127,13 @@ void impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); - Functor_TestBatchedTeamVectorQR_WithColumnPivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, WorkViewType, - AlgoTagType>(a, x, b, t, p, w) + Functor_TestBatchedTeamVectorQR_WithColumnPivoting(a, x, b, t, p, w) .run(); Kokkos::fence(); @@ -164,48 +157,35 @@ void impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_qr_with_columnpivoting() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivotViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, WorkViewType, - AlgoTagType>(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivotViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr_with_columnpivoting(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, - WorkViewType, AlgoTagType>(1024, i); + Test::impl_test_batched_qr_with_columnpivoting(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivotViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, WorkViewType, - AlgoTagType>(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivotViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_qr_with_columnpivoting(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_qr_with_columnpivoting< - DeviceType, MatrixViewType, VectorViewType, PivotViewType, - WorkViewType, AlgoTagType>(1024, i); + Test::impl_test_batched_qr_with_columnpivoting(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp index 2f30c7d3c1..fdf482b4ab 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorSolveUTV { using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; @@ -42,11 +42,9 @@ struct Functor_TestBatchedTeamVectorSolveUTV { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSolveUTV( - const MatrixViewType &r, const MatrixViewType &a, - const MatrixViewType &acopy, const MatrixViewType &u, - const MatrixViewType &v, const PivViewType &p, const VectorViewType &x, - const VectorViewType &b, const WorkViewType &w) + Functor_TestBatchedTeamVectorSolveUTV(const MatrixViewType &r, const MatrixViewType &a, const MatrixViewType &acopy, + const MatrixViewType &u, const MatrixViewType &v, const PivViewType &p, + const VectorViewType &x, const VectorViewType &b, const WorkViewType &w) : _r(r), _a(a), _acopy(acopy), _u(u), _v(v), _p(p), _x(x), _b(b), _w(w) {} template @@ -72,22 +70,18 @@ struct Functor_TestBatchedTeamVectorSolveUTV { // make diagonal dominant and set xx = 1,2,3,4,5 const int m = aa.extent(0), r = rr.extent(1); if (m <= r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - aa(i, i) += add_this; - xx(i) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + aa(i, i) += add_this; + xx(i) = (i + 1); + }); } else { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), - [=](const int &ij) { - const int i = ij / m, j = ij % m; - value_type tmp(0); - for (int l = 0; l < r; ++l) - tmp += rr(i, l) * rr(j, l); - aa(i, j) = tmp; - }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { xx(i) = (i + 1); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), [=](const int &ij) { + const int i = ij / m, j = ij % m; + value_type tmp(0); + for (int l = 0; l < r; ++l) tmp += rr(i, l) * rr(j, l); + aa(i, j) = tmp; + }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { xx(i) = (i + 1); }); } member.team_barrier(); // finish writing aa, xx @@ -95,9 +89,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// Solving Ax = b using UTV transformation @@ -106,12 +99,10 @@ struct Functor_TestBatchedTeamVectorSolveUTV { /// UTV = A P^T int matrix_rank(0); - TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, - matrix_rank); + TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, matrix_rank); member.team_barrier(); - TeamVectorSolveUTV::invoke(member, matrix_rank, uu, - aa, vv, pp, xx, bb, ww); + TeamVectorSolveUTV::invoke(member, matrix_rank, uu, aa, vv, pp, xx, bb, ww); } inline void run() { @@ -129,8 +120,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV { } }; -template +template void impl_test_batched_solve_utv(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -148,8 +139,7 @@ void impl_test_batched_solve_utv(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); if (BlkSize <= 3) Kokkos::fill_random(a, random, value_type(1.0)); else @@ -157,10 +147,8 @@ void impl_test_batched_solve_utv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorSolveUTV( - r, a, acopy, u, v, p, x, b, w) + Functor_TestBatchedTeamVectorSolveUTV(r, a, acopy, u, v, p, x, b, w) .run(); Kokkos::fence(); @@ -203,48 +191,35 @@ void impl_test_batched_solve_utv(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_solve_utv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_solve_utv(1024, i); + Test::impl_test_batched_solve_utv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_solve_utv(1024, i); + Test::impl_test_batched_solve_utv(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp index cf7084a92c..b38fb318e6 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp @@ -32,8 +32,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorSolveUTV2 { using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; @@ -42,11 +42,9 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSolveUTV2( - const MatrixViewType &r, const MatrixViewType &a, - const MatrixViewType &acopy, const MatrixViewType &u, - const MatrixViewType &v, const PivViewType &p, const VectorViewType &x, - const VectorViewType &b, const WorkViewType &w) + Functor_TestBatchedTeamVectorSolveUTV2(const MatrixViewType &r, const MatrixViewType &a, const MatrixViewType &acopy, + const MatrixViewType &u, const MatrixViewType &v, const PivViewType &p, + const VectorViewType &x, const VectorViewType &b, const WorkViewType &w) : _r(r), _a(a), _acopy(acopy), _u(u), _v(v), _p(p), _x(x), _b(b), _w(w) {} template @@ -72,24 +70,20 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { // make diagonal dominant and set xx = 1,2,3,4,5 const int m = aa.extent(0), r = rr.extent(1); if (m <= r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - aa(i, i) += add_this; - for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + aa(i, i) += add_this; + for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); + }); } else { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), - [=](const int &ij) { - const int i = ij / m, j = ij % m; - value_type tmp(0); - for (int l = 0; l < r; ++l) - tmp += rr(i, l) * rr(j, l); - aa(i, j) = tmp; - }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), [=](const int &ij) { + const int i = ij / m, j = ij % m; + value_type tmp(0); + for (int l = 0; l < r; ++l) tmp += rr(i, l) * rr(j, l); + aa(i, j) = tmp; + }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + for (int j = 0; j < 2; ++j) xx(i, j) = (i + 1); + }); } member.team_barrier(); // finish writing aa, xx @@ -97,11 +91,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - KokkosBatched::TeamVectorGemm::invoke(member, one, - aa, xx, zero, - bb); + KokkosBatched::TeamVectorGemm::invoke( + member, one, aa, xx, zero, bb); member.team_barrier(); /// Solving Ax = b using UTV transformation @@ -110,12 +101,10 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { /// UTV = A P^T int matrix_rank(0); - TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, - matrix_rank); + TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, matrix_rank); member.team_barrier(); - TeamVectorSolveUTV::invoke(member, matrix_rank, uu, - aa, vv, pp, xx, bb, ww); + TeamVectorSolveUTV::invoke(member, matrix_rank, uu, aa, vv, pp, xx, bb, ww); } inline void run() { @@ -133,8 +122,8 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { } }; -template +template void impl_test_batched_solve_utv2(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -152,8 +141,7 @@ void impl_test_batched_solve_utv2(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); if (BlkSize <= 3) Kokkos::fill_random(a, random, value_type(1.0)); else @@ -161,10 +149,8 @@ void impl_test_batched_solve_utv2(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorSolveUTV2( - r, a, acopy, u, v, p, x, b, w) + Functor_TestBatchedTeamVectorSolveUTV2(r, a, acopy, u, v, p, x, b, w) .run(); Kokkos::fence(); @@ -210,48 +196,35 @@ void impl_test_batched_solve_utv2(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_solve_utv2() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv2(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv2(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_solve_utv2(1024, i); + Test::impl_test_batched_solve_utv2(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_solve_utv2(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_solve_utv2(0, 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_solve_utv2(1024, i); + Test::impl_test_batched_solve_utv2(1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp index eb45a70c89..44f6ec394a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp @@ -31,8 +31,8 @@ using namespace KokkosBatched; namespace Test { -template +template struct Functor_TestBatchedTeamVectorUTV { using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; @@ -41,11 +41,9 @@ struct Functor_TestBatchedTeamVectorUTV { WorkViewType _w; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorUTV( - const MatrixViewType &r, const MatrixViewType &a, - const MatrixViewType &acopy, const MatrixViewType &u, - const MatrixViewType &v, const PivViewType &p, const VectorViewType &x, - const VectorViewType &b, const WorkViewType &w) + Functor_TestBatchedTeamVectorUTV(const MatrixViewType &r, const MatrixViewType &a, const MatrixViewType &acopy, + const MatrixViewType &u, const MatrixViewType &v, const PivViewType &p, + const VectorViewType &x, const VectorViewType &b, const WorkViewType &w) : _r(r), _a(a), _acopy(acopy), _u(u), _v(v), _p(p), _x(x), _b(b), _w(w) {} template @@ -71,22 +69,18 @@ struct Functor_TestBatchedTeamVectorUTV { // make diagonal dominant and set xx = 1,2,3,4,5 const int m = aa.extent(0), r = rr.extent(1); if (m <= r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { - aa(i, i) += add_this; - xx(i) = (i + 1); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { + aa(i, i) += add_this; + xx(i) = (i + 1); + }); } else { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), - [=](const int &ij) { - const int i = ij / m, j = ij % m; - value_type tmp(0); - for (int l = 0; l < r; ++l) - tmp += rr(i, l) * rr(j, l); - aa(i, j) = tmp; - }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { xx(i) = (i + 1); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m * m), [=](const int &ij) { + const int i = ij / m, j = ij % m; + value_type tmp(0); + for (int l = 0; l < r; ++l) tmp += rr(i, l) * rr(j, l); + aa(i, j) = tmp; + }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { xx(i) = (i + 1); }); } member.team_barrier(); // finish writing aa, xx @@ -94,9 +88,8 @@ struct Functor_TestBatchedTeamVectorUTV { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - KokkosBlas::TeamVectorGemv::invoke(member, one, aa, - xx, zero, bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, xx, zero, + bb); member.team_barrier(); /// Solving Ax = b using UTV transformation @@ -105,46 +98,41 @@ struct Functor_TestBatchedTeamVectorUTV { /// UTV = A P^T int matrix_rank(0); - TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, - matrix_rank); + TeamVectorUTV::invoke(member, aa, pp, uu, vv, ww, matrix_rank); member.team_barrier(); const auto range_upto_rank = Kokkos::pair(0, matrix_rank); - auto um = Kokkos::subview(uu, Kokkos::ALL(), range_upto_rank); - auto am = Kokkos::subview(aa, range_upto_rank, range_upto_rank); - auto vm = Kokkos::subview(vv, range_upto_rank, Kokkos::ALL()); + auto um = Kokkos::subview(uu, Kokkos::ALL(), range_upto_rank); + auto am = Kokkos::subview(aa, range_upto_rank, range_upto_rank); + auto vm = Kokkos::subview(vv, range_upto_rank, Kokkos::ALL()); if (matrix_rank < m) { /// w = U^T b - KokkosBlas::TeamVectorGemv::invoke(member, one, um, - bb, zero, ww); + KokkosBlas::TeamVectorGemv::invoke(member, one, um, bb, zero, + ww); member.team_barrier(); /// w = T^{-1} w - TeamVectorTrsv::invoke(member, one, am, ww); + TeamVectorTrsv::invoke( + member, one, am, ww); member.team_barrier(); /// x = V^T w - KokkosBlas::TeamVectorGemv::invoke(member, one, vm, - ww, zero, xx); + KokkosBlas::TeamVectorGemv::invoke(member, one, vm, ww, zero, + xx); member.team_barrier(); } else { /// x = U^T b - KokkosBlas::TeamVectorGemv::invoke(member, one, um, - bb, zero, xx); + KokkosBlas::TeamVectorGemv::invoke(member, one, um, bb, zero, + xx); member.team_barrier(); /// x = T^{-1} x - TeamVectorTrsv::invoke(member, one, am, xx); + TeamVectorTrsv::invoke( + member, one, am, xx); member.team_barrier(); } /// x = P^T x - TeamVectorApplyPivot::invoke( - member, pp, xx); + TeamVectorApplyPivot::invoke(member, pp, xx); member.team_barrier(); } @@ -163,8 +151,8 @@ struct Functor_TestBatchedTeamVectorUTV { } }; -template +template void impl_test_batched_utv(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ats; @@ -182,8 +170,7 @@ void impl_test_batched_utv(const int N, const int BlkSize) { Kokkos::fence(); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); if (BlkSize <= 3) Kokkos::fill_random(a, random, value_type(1.0)); else @@ -191,8 +178,7 @@ void impl_test_batched_utv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorUTV( + Functor_TestBatchedTeamVectorUTV( r, a, acopy, u, v, p, x, b, w) .run(); @@ -236,46 +222,35 @@ void impl_test_batched_utv(const int N, const int BlkSize) { } } // namespace Test -template +template int test_batched_utv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_utv(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_utv(0, + 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::impl_test_batched_utv(1024, - i); + Test::impl_test_batched_utv( + 1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - MatrixViewType; - typedef Kokkos::View - VectorViewType; - typedef Kokkos::View - PivViewType; - typedef Kokkos::View - WorkViewType; - Test::impl_test_batched_utv(0, 10); + typedef Kokkos::View MatrixViewType; + typedef Kokkos::View VectorViewType; + typedef Kokkos::View PivViewType; + typedef Kokkos::View WorkViewType; + Test::impl_test_batched_utv(0, + 10); for (int i = 1; i < 10; ++i) { // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::impl_test_batched_utv(1024, - i); + Test::impl_test_batched_utv( + 1024, i); } } #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp index 9d1205717f..654d199117 100644 --- a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -95,132 +93,91 @@ void impl_test_batched_vector_arithmatic() { { /// test : vec + vec c = a + b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + b[k]), eps * ats::abs(c[k])); /// test : value + vec c = alpha + b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha + b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha + b[k]), eps * ats::abs(c[k])); /// test : vec + value c = b + alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] + alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] + alpha), eps * ats::abs(c[k])); /// test : vec + mag c = a + beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + beta), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] + beta), eps * ats::abs(c[k])); /// test : mag + vec c = beta + a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta + a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta + a[k]), eps * ats::abs(c[k])); } { /// test : vec - vec c = a - b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - b[k]), eps * ats::abs(c[k])); /// test : value - vec c = alpha - b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha - b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha - b[k]), eps * ats::abs(c[k])); /// test : vec + value c = b - alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] - alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] - alpha), eps * ats::abs(c[k])); /// test : vec - mag c = a - beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - beta), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] - beta), eps * ats::abs(c[k])); /// test : mag - vec c = beta - a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta - a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta - a[k]), eps * ats::abs(c[k])); } { /// test : vec * vec c = a * b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * b[k]), eps * ats::abs(c[k])); /// test : value * vec c = alpha * b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha * b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha * b[k]), eps * ats::abs(c[k])); /// test : vec + value c = b * alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] * alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] * alpha), eps * ats::abs(c[k])); /// test : vec * mag c = a * beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * beta), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] * beta), eps * ats::abs(c[k])); /// test : mag * vec c = beta * a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta * a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta * a[k]), eps * ats::abs(c[k])); } { /// test : vec / vec c = a / b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / b[k]), eps * ats::abs(c[k])); /// test : value / vec c = alpha / b; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha / b[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(alpha / b[k]), eps * ats::abs(c[k])); /// test : vec / value c = b / alpha; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] / alpha), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(b[k] / alpha), eps * ats::abs(c[k])); /// test : mag / vec c = beta / a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta / a[k]), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(beta / a[k]), eps * ats::abs(c[k])); /// test : vec / value c = a / beta; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / beta), - eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] / beta), eps * ats::abs(c[k])); } { /// test : vec -vec c = -a; - for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(-a[k]), eps * ats::abs(c[k])); + for (int k = 0; k < vector_length; ++k) EXPECT_NEAR(ats::abs(c[k]), ats::abs(-a[k]), eps * ats::abs(c[k])); } #if defined(__DO_NOT_TEST__) { @@ -232,8 +189,7 @@ void impl_test_batched_vector_arithmatic() { c += vector_type(tiny) * vector_type(a >= 0); for (int k = 0; k < vector_length; ++k) - EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] < 0 ? -tiny : tiny), - eps * ats::abs(c[k])); + EXPECT_NEAR(ats::abs(c[k]), ats::abs(a[k] < 0 ? -tiny : tiny), eps * ats::abs(c[k])); } #endif } @@ -242,18 +198,16 @@ void impl_test_batched_vector_arithmatic() { template int test_batched_vector_arithmatic() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_arithmatic(); return 0; } template int test_batched_complex_real_imag_value() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_complex_real_imag_value(); return 0; @@ -297,65 +251,53 @@ TEST_F(TestCategory, batched_vector_arithmatic_simd_double8) { #define __DO_NOT_TEST__ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex3) { - test_batched_vector_arithmatic >, - 3>(); + test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex4) { - test_batched_vector_arithmatic >, - 4>(); + test_batched_vector_arithmatic >, 4>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex8) { - test_batched_vector_arithmatic >, - 8>(); + test_batched_vector_arithmatic >, 8>(); } TEST_F(TestCategory, batched_vector_scomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); + test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_scomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); + test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_scomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); + test_batched_complex_real_imag_value >, 4>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex3) { - test_batched_vector_arithmatic >, - 3>(); + test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex2) { - test_batched_vector_arithmatic >, - 2>(); + test_batched_vector_arithmatic >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex4) { - test_batched_vector_arithmatic >, - 4>(); + test_batched_vector_arithmatic >, 4>(); } TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); + test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); + test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); + test_batched_complex_real_imag_value >, 4>(); } #endif #undef __DO_NOT_TEST__ diff --git a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp index 5ab10bb5bd..0427982a42 100644 --- a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -59,33 +57,30 @@ void impl_test_batched_vector_logical() { { #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op b[i]); \ +#define CHECK(op) \ + { \ + const auto comparison = a op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op b[i]); \ } CHECK(||); CHECK(&&); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op 0; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op 0); \ +#define CHECK(op) \ + { \ + const auto comparison = a op 0; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op 0); \ } CHECK(||); CHECK(&&); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = 0 op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], 0 op b[i]); \ +#define CHECK(op) \ + { \ + const auto comparison = 0 op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], 0 op b[i]); \ } CHECK(||); @@ -100,9 +95,8 @@ void impl_test_batched_vector_logical() { template int test_batched_vector_logical() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_logical(); return 0; @@ -113,21 +107,13 @@ int test_batched_vector_logical() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_logical_simd_float3) { - test_batched_vector_logical(); -} -TEST_F(TestCategory, batched_vector_logical_simd_float8) { - test_batched_vector_logical(); -} +TEST_F(TestCategory, batched_vector_logical_simd_float3) { test_batched_vector_logical(); } +TEST_F(TestCategory, batched_vector_logical_simd_float8) { test_batched_vector_logical(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_logical_simd_double3) { - test_batched_vector_logical(); -} -TEST_F(TestCategory, batched_vector_logical_simd_double4) { - test_batched_vector_logical(); -} +TEST_F(TestCategory, batched_vector_logical_simd_double3) { test_batched_vector_logical(); } +TEST_F(TestCategory, batched_vector_logical_simd_double4) { test_batched_vector_logical(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) diff --git a/batched/dense/unit_test/Test_Batched_VectorMath.hpp b/batched/dense/unit_test/Test_Batched_VectorMath.hpp index 02c943d587..2cd9f02a49 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMath.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMath.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -67,11 +65,10 @@ void impl_test_batched_vector_math() { { #undef CHECK -#define CHECK(op) \ - { \ - a = op(aref); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::op(aref[i]), eps* a[i]); \ +#define CHECK(op) \ + { \ + a = op(aref); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::op(aref[i]), eps* a[i]); \ } CHECK(sqrt); @@ -89,32 +86,29 @@ void impl_test_batched_vector_math() { CHECK(atan); #undef CHECK -#define CHECK \ - { \ - a = pow(aref, bref); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::pow(aref[i], bref[i]), eps* a[i]); \ - } \ +#define CHECK \ + { \ + a = pow(aref, bref); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::pow(aref[i], bref[i]), eps* a[i]); \ + } \ CHECK; #undef CHECK -#define CHECK(op) \ - { \ - mag_type beta = mag_type(3.2); \ - a = op(aref, beta); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::op(aref[i], beta), eps* a[i]); \ +#define CHECK(op) \ + { \ + mag_type beta = mag_type(3.2); \ + a = op(aref, beta); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::op(aref[i], beta), eps* a[i]); \ } CHECK(pow); #undef CHECK -#define CHECK(op) \ - { \ - value_type alpha = random.value() + 2.0; \ - a = op(alpha, bref); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_NEAR_KK(a[i], ats::op(alpha, bref[i]), eps* a[i]); \ +#define CHECK(op) \ + { \ + value_type alpha = random.value() + 2.0; \ + a = op(alpha, bref); \ + for (int i = 0; i < vector_length; ++i) EXPECT_NEAR_KK(a[i], ats::op(alpha, bref[i]), eps* a[i]); \ } CHECK(pow); @@ -126,9 +120,8 @@ void impl_test_batched_vector_math() { template int test_batched_vector_math() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_math(); return 0; @@ -156,21 +149,13 @@ int test_batched_vector_math() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_math_simd_float3) { - test_batched_vector_math, 3>(); -} -TEST_F(TestCategory, batched_vector_math_simd_float8) { - test_batched_vector_math, 8>(); -} +TEST_F(TestCategory, batched_vector_math_simd_float3) { test_batched_vector_math, 3>(); } +TEST_F(TestCategory, batched_vector_math_simd_float8) { test_batched_vector_math, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_math_simd_double3) { - test_batched_vector_math, 3>(); -} -TEST_F(TestCategory, batched_vector_math_simd_double4) { - test_batched_vector_math, 4>(); -} +TEST_F(TestCategory, batched_vector_math_simd_double3) { test_batched_vector_math, 3>(); } +TEST_F(TestCategory, batched_vector_math_simd_double4) { test_batched_vector_math, 4>(); } #endif // using namespace Test; diff --git a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp index 5f176ccba8..98d7f4e87e 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -159,9 +157,8 @@ void impl_test_batched_vector_misc() { template int test_batched_vector_misc() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_misc(); return 0; @@ -172,21 +169,13 @@ int test_batched_vector_misc() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_misc_simd_float3) { - test_batched_vector_misc, 3>(); -} -TEST_F(TestCategory, batched_vector_misc_simd_float8) { - test_batched_vector_misc, 8>(); -} +TEST_F(TestCategory, batched_vector_misc_simd_float3) { test_batched_vector_misc, 3>(); } +TEST_F(TestCategory, batched_vector_misc_simd_float8) { test_batched_vector_misc, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_misc_simd_double3) { - test_batched_vector_misc, 3>(); -} -TEST_F(TestCategory, batched_vector_misc_simd_double4) { - test_batched_vector_misc, 4>(); -} +TEST_F(TestCategory, batched_vector_misc_simd_double3) { test_batched_vector_misc, 3>(); } +TEST_F(TestCategory, batched_vector_misc_simd_double4) { test_batched_vector_misc, 4>(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) diff --git a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp index 1aff1b2d0f..e5c3139c5c 100644 --- a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -60,11 +58,10 @@ void impl_test_batched_vector_relation() { { #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op b[i]); \ +#define CHECK(op) \ + { \ + const auto comparison = a op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op b[i]); \ } CHECK(<); @@ -75,11 +72,10 @@ void impl_test_batched_vector_relation() { CHECK(!=); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = a op value_type(0); \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], a[i] op value_type(0)); \ +#define CHECK(op) \ + { \ + const auto comparison = a op value_type(0); \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], a[i] op value_type(0)); \ } CHECK(<); @@ -90,11 +86,10 @@ void impl_test_batched_vector_relation() { CHECK(!=); #undef CHECK -#define CHECK(op) \ - { \ - const auto comparison = value_type(0) op b; \ - for (int i = 0; i < vector_length; ++i) \ - EXPECT_EQ(comparison[i], value_type(0) op b[i]); \ +#define CHECK(op) \ + { \ + const auto comparison = value_type(0) op b; \ + for (int i = 0; i < vector_length; ++i) EXPECT_EQ(comparison[i], value_type(0) op b[i]); \ } CHECK(<); @@ -113,9 +108,8 @@ void impl_test_batched_vector_relation() { template int test_batched_vector_relation() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); Test::impl_test_batched_vector_relation(); return 0; diff --git a/batched/dense/unit_test/Test_Batched_VectorView.hpp b/batched/dense/unit_test/Test_Batched_VectorView.hpp index 74c7748cba..5d9047e57c 100644 --- a/batched/dense/unit_test/Test_Batched_VectorView.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorView.hpp @@ -21,10 +21,8 @@ // to ensure it is not included in these // backends unit-test -#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && \ - !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ - !defined(TEST_SYCL_BATCHED_DENSE_CPP) && \ - !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) +#if !defined(TEST_CUDA_BATCHED_DENSE_CPP) && !defined(TEST_HIP_BATCHED_DENSE_CPP) && \ + !defined(TEST_SYCL_BATCHED_DENSE_CPP) && !defined(TEST_OPENMPTARGET_BATCHED_DENSE_CPP) #include "gtest/gtest.h" #include "Kokkos_Core.hpp" @@ -62,100 +60,76 @@ void impl_init_vector_view(const VectorViewType& a) { for (int i7 = 0, i7end = b.extent(7); i7 < i7end; ++i7) template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0 / vl, i1, i2, i3, i4, i5, i6, i7)[i0 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0 / vl, i1, i2, i3, i4, i5, i6, i7)[i0 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1 / vl, i2, i3, i4, i5, i6, i7)[i1 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1 / vl, i2, i3, i4, i5, i6, i7)[i1 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2 / vl, i3, i4, i5, i6, i7)[i2 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2 / vl, i3, i4, i5, i6, i7)[i2 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3 / vl, i4, i5, i6, i7)[i3 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3 / vl, i4, i5, i6, i7)[i3 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4 / vl, i5, i6, i7)[i4 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4 / vl, i5, i6, i7)[i4 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5 / vl, i6, i7)[i5 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5 / vl, i6, i7)[i5 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6 / vl, i7)[i6 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6 / vl, i7)[i6 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template -void impl_verify_vector_view( - const VectorViewType& a, - const SimdViewAccess >& b) { +void impl_verify_vector_view(const VectorViewType& a, const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP - EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6, i7 / vl)[i7 % vl], - b(i0, i1, i2, i3, i4, i5, i6, i7), eps); + EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6, i7 / vl)[i7 % vl], b(i0, i1, i2, i3, i4, i5, i6, i7), eps); } template @@ -169,183 +143,90 @@ void impl_test_batched_vector_view() { { /// rank 1 array Kokkos::View a("a", test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); } { /// rank 2 array - Kokkos::View a("a", test_view_size, - test_view_size); + Kokkos::View a("a", test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, SimdViewAccess, PackDim<1> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); } { /// rank 3 array - Kokkos::View a("a", test_view_size, - test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<1> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<2> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); } { /// rank 4 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<1> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<2> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<3> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); } { /// rank 5 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<0> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<1> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<2> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<3> >( - a)); - impl_verify_vector_view( - a, - SimdViewAccess, PackDim<4> >( - a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); } { /// rank 6 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<0> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<1> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<2> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<3> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<4> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<5> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<5> >(a)); } { /// rank 7 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size, test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<0> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<1> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<2> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<3> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<4> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<5> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<6> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<5> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<6> >(a)); } { /// rank 8 array - Kokkos::View a( - "a", test_view_size, test_view_size, test_view_size, test_view_size, - test_view_size, test_view_size, test_view_size, test_view_size); + Kokkos::View a("a", test_view_size, test_view_size, test_view_size, test_view_size, + test_view_size, test_view_size, test_view_size, test_view_size); impl_init_vector_view(a); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<0> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<1> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<2> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<3> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<4> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<5> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<6> >(a)); - impl_verify_vector_view( - a, SimdViewAccess, - PackDim<7> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<0> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<1> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<2> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<3> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<4> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<5> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<6> >(a)); + impl_verify_vector_view(a, SimdViewAccess, PackDim<7> >(a)); } } } // namespace Test template int test_batched_vector_view() { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "vector datatype is only tested on host space"); - Test::impl_test_batched_vector_view(); + static_assert(Kokkos::SpaceAccessibility::accessible, + "vector datatype is only tested on host space"); + Test::impl_test_batched_vector_view(); return 0; } @@ -355,18 +236,12 @@ int test_batched_vector_view() { /// #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_vector_view_simd_float8) { - test_batched_vector_view, 8>(); -} +TEST_F(TestCategory, batched_vector_view_simd_float8) { test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_vector_view_simd_double4) { - test_batched_vector_view, 4>(); -} -TEST_F(TestCategory, batched_vector_view_simd_double8) { - test_batched_vector_view, 8>(); -} +TEST_F(TestCategory, batched_vector_view_simd_double4) { test_batched_vector_view, 4>(); } +TEST_F(TestCategory, batched_vector_view_simd_double8) { test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) @@ -383,8 +258,7 @@ TEST_F(TestCategory, batched_vector_view_simd_dcomplex2) { test_batched_vector_view >, 2>(); } -#if defined(KOKKOS_COMPILER_INTEL) && \ - ((KOKKOS_COMPILER_INTEL > 1900) && (KOKKOS_COMPILER_INTEL <= 2021)) +#if defined(KOKKOS_COMPILER_INTEL) && ((KOKKOS_COMPILER_INTEL > 1900) && (KOKKOS_COMPILER_INTEL <= 2021)) TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { printf( "Skipped: intel compiler version > 19.0.05 && <= 2021\n" diff --git a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index c11ad96959..9aa4b95f2c 100644 --- a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -35,16 +35,14 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { +KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle, const TMPViewType& _TMPView, + const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; const size_t maximum_iteration = handle.get_max_iteration(); const MagnitudeType tolerance = handle.get_tolerance(); @@ -59,14 +57,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( int offset_R = offset_Q + numRows; int offset_X = offset_R + numRows; - auto P = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_P, offset_P + numRows)); - auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_Q, offset_Q + numRows)); - auto R = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_R, offset_R + numRows)); - auto X = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + numRows)); + auto P = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_P, offset_P + numRows)); + auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_Q, offset_Q + numRows)); + auto R = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_R, offset_R + numRows)); + auto X = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_X, offset_X + numRows)); auto sqr_norm_0 = Kokkos::subview(_TMPNormView, Kokkos::ALL, 0); auto sqr_norm_j = Kokkos::subview(_TMPNormView, Kokkos::ALL, 1); @@ -90,10 +84,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; - }); + [&](const OrdinalType& i) { mask(i) = sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; }); TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); @@ -109,10 +100,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; }); member.team_barrier(); // x_{j+1} := alpha p_j + x_j @@ -131,10 +119,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j); @@ -167,55 +152,43 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - 4 * numRows); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 4 * numRows); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } if (strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } return 0; } diff --git a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index bf2f1d2e86..82c62624c1 100644 --- a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -34,15 +34,14 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandle& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { +template +KOKKOS_INLINE_FUNCTION int TeamCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandle& handle, const TMPViewType& _TMPView, + const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; size_t maximum_iteration = handle.get_max_iteration(); const MagnitudeType tolerance = handle.get_tolerance(); @@ -57,14 +56,10 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( int offset_R = offset_Q + numRows; int offset_X = offset_R + numRows; - auto P = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_P, offset_P + numRows)); - auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_Q, offset_Q + numRows)); - auto R = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_R, offset_R + numRows)); - auto X = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + numRows)); + auto P = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_P, offset_P + numRows)); + auto Q = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_Q, offset_Q + numRows)); + auto R = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_R, offset_R + numRows)); + auto X = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_X, offset_X + numRows)); auto sqr_norm_0 = Kokkos::subview(_TMPNormView, Kokkos::ALL, 0); auto sqr_norm_j = Kokkos::subview(_TMPNormView, Kokkos::ALL, 1); @@ -88,10 +83,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; - }); + [&](const OrdinalType& i) { mask(i) = sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; }); TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); @@ -107,10 +99,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; }); member.team_barrier(); // x_{j+1} := alpha p_j + x_j @@ -129,10 +118,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; - }); + [&](const OrdinalType& i) { alpha(i) = mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); TeamCopy1D::invoke(member, tmp, sqr_norm_j); @@ -165,55 +151,43 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( } template -template -KOKKOS_INLINE_FUNCTION int TeamCG::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamCG::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - 4 * numRows); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 4 * numRows); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } if (strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type**, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadNormViewType = + Kokkos::View::mag_type**, + typename VectorViewType::execution_space::scratch_memory_space>; const int numMatrices = _X.extent(0); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - ScratchPadNormViewType _TMPNormView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); + ScratchPadNormViewType _TMPNormView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, 5); - return invoke( - member, A, _B, _X, handle, _TMPView, _TMPNormView); + return invoke(member, A, _B, _X, handle, _TMPView, _TMPNormView); } return 0; } diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index 923b67c105..2d8c0cae00 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -36,17 +36,12 @@ namespace KokkosBatched { /// Serial GMRES /// -template -KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle, - const int GMRES_id) { +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const int GMRES_id) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; typedef Kokkos::ArithTraits ATM; using SerialCopy1D = SerialCopy; @@ -55,9 +50,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); const MagnitudeType max_tolerance = handle.get_max_tolerance(); @@ -72,15 +65,12 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const int first_matrix = handle.first_index(GMRES_id); const int last_matrix = handle.last_index(GMRES_id); - auto V_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + auto V_view = Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, + Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, + Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G = maximum_iteration + 1; int n_W = numRows; @@ -91,18 +81,12 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - auto G = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::make_pair(offset_W, offset_W + n_W)); - auto mask = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - offset_mask); - auto tmp = - Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), offset_tmp); + auto G = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto mask = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), offset_mask); + auto tmp = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), offset_tmp); // Deep copy of b into r_0: SerialCopy2D::invoke(_B, W); @@ -149,19 +133,14 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, if (handle.get_ortho_strategy() == 0) { for (OrdinalType l = 0; l < numMatrices; ++l) { auto W_l = Kokkos::subview(W, l, Kokkos::ALL); - auto V_old = Kokkos::subview( - V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = - Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); + auto V_old = Kokkos::subview(V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); // Inner products - KokkosBlas::SerialGemv::invoke(1, V_old, W_l, 0, - H_old); + KokkosBlas::SerialGemv::invoke(1, V_old, W_l, 0, H_old); // Update - KokkosBlas::SerialGemv::invoke( - -1, V_old, H_old, 1, W_l); + KokkosBlas::SerialGemv::invoke(-1, V_old, H_old, 1, W_l); } } if (handle.get_ortho_strategy() == 1) { @@ -179,8 +158,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, for (OrdinalType i = 0; i < numMatrices; ++i) { H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = - H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; + tmp(i) = H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; } if (j + 1 < maximum_iteration) { @@ -207,8 +185,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, } // Compute the new Givens rotation: - Kokkos::pair + Kokkos::pair G_new(1, 0); typename VectorViewType::non_const_value_type alpha = 0; SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); @@ -241,8 +218,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, } bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); + for (OrdinalType l = 0; l < numMatrices; ++l) all_converged = (all_converged && mask(l) == 0.); if (all_converged) { maximum_iteration = j + 1; break; @@ -255,23 +231,19 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); auto B_l = Kokkos::subview(G, l, first_indices); - SerialTrsm::invoke(1, A_l, B_l); + SerialTrsm::invoke(1, A_l, B_l); } if (handle.get_ortho_strategy() == 0) { for (OrdinalType l = 0; l < numMatrices; ++l) { KokkosBlas::SerialGemv::invoke( - 1, Kokkos::subview(V_view, l, first_indices, Kokkos::ALL), - Kokkos::subview(G, l, first_indices), 1, + 1, Kokkos::subview(V_view, l, first_indices, Kokkos::ALL), Kokkos::subview(G, l, first_indices), 1, Kokkos::subview(_X, l, Kokkos::ALL)); } } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { - SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), - _X); + SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); } } @@ -289,12 +261,9 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, return status; } -template -KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { Identity P; return invoke(A, _B, _X, P, handle); } diff --git a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index a7219ecc91..8d37b2ac5e 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -39,17 +39,16 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView) { +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle, + const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; typedef Kokkos::ArithTraits ATM; using TeamVectorCopy1D = TeamVectorCopy; @@ -57,9 +56,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); const MagnitudeType max_tolerance = handle.get_max_tolerance(); @@ -71,13 +68,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( int offset_H = offset_V + n_V; int offset_Givens = offset_H + n_H; - auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - _ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G = maximum_iteration + 1; int n_W = numRows; @@ -88,10 +82,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - auto G = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_W, offset_W + n_W)); + auto G = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_W, offset_W + n_W)); auto mask = Kokkos::subview(_TMPView, Kokkos::ALL, offset_mask); auto tmp = Kokkos::subview(_TMPView, Kokkos::ALL, offset_tmp); @@ -109,33 +101,29 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_norm(member.league_rank(), i, 0, tmp(i)); - if (tmp(i) > max_tolerance) { - mask(i) = 1; - G(i, 0) = tmp(i); - tmp(i) = 1. / tmp(i); - } else { - handle.set_iteration(member.league_rank(), i, 0); - mask(i) = 0; - G(i, 0) = 0.; - tmp(i) = 0.; - } - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); member.team_barrier(); // Finish writing to tmp auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); int status = 1; // int number_not_converged = 0; @@ -151,20 +139,14 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( member.team_barrier(); if (handle.get_ortho_strategy() == 0) { - auto V_old = Kokkos::subview( - V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, - Kokkos::make_pair(0, (int)j + 1)); + auto V_old = Kokkos::subview(V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, Kokkos::make_pair(0, (int)j + 1)); // Inner products - TeamVectorGemv::invoke(member, 1, V_old, W, 0, - H_old); + TeamVectorGemv::invoke(member, 1, V_old, W, 0, H_old); member.team_barrier(); // Update - TeamVectorGemv::invoke(member, -1, V_old, H_old, 1, - W); + TeamVectorGemv::invoke(member, -1, V_old, H_old, 1, W); member.team_barrier(); // Finish writing to W } if (handle.get_ortho_strategy() == 1) { @@ -172,12 +154,10 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); TeamVectorDot::invoke(member, W, V_i, tmp); member.team_barrier(); - TeamVectorCopy1D::invoke(member, tmp, - Kokkos::subview(H_view, Kokkos::ALL, j, i)); + TeamVectorCopy1D::invoke(member, tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); member.team_barrier(); // Finish writing to tmp @@ -188,82 +168,71 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = H_view(i, j, j + 1) > max_tolerance - ? 1. / H_view(i, j, j + 1) - : 0.; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; + }); member.team_barrier(); if (j + 1 < maximum_iteration) { auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); - auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); - auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); - - if (mask(l) == 1.) { - for (size_t i = 0; i < j; ++i) { - auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); - auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens_0_l(j) = G_new.first; - Givens_1_l(j) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); - auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens_1_l(j) * G(l, j); - G(l, j) *= Givens_0_l(j); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; - } - - auto res_norm = - Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); - - handle.set_norm(member.league_rank(), l, j + 1, res_norm); - - if (mask(l) == 1. && res_norm < tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - handle.set_iteration(member.league_rank(), l, j + 1); - } - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) { + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } + }); member.team_barrier(); bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); + for (OrdinalType l = 0; l < numMatrices; ++l) all_converged = (all_converged && mask(l) == 0.); if (all_converged) { maximum_iteration = j + 1; break; @@ -274,30 +243,25 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); - auto B_l = Kokkos::subview(G, l, first_indices); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - SerialTrsm::invoke(1, A_l, B_l); - }); + SerialTrsm::invoke(1, A_l, B_l); + }); member.team_barrier(); // Finish writing to G if (handle.get_ortho_strategy() == 0) { TeamVectorGemv::invoke( - member, 1, - Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), Kokkos::subview(G, Kokkos::ALL, first_indices), 1, _X); member.team_barrier(); // Finish writing to _X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { - TeamVectorAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); + TeamVectorAxpy::invoke(member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); member.team_barrier(); // Finish writing to _X } } @@ -305,128 +269,105 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( if (handle.get_compute_last_residual()) { TeamVectorCopy::invoke(member, _B, W); member.team_barrier(); - A.template apply(member, _X, W, -1, - 1); + A.template apply(member, _X, W, -1, 1); member.team_barrier(); P.template apply(member, W, W); member.team_barrier(); TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_last_norm(member.league_rank(), i, - tmp(i)); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, tmp(i)); + }); } return status; } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 2) { - using ScratchPadArnoldiViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadArnoldiViewType = + Kokkos::View; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - ScratchPadArnoldiViewType _ArnoldiView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - maximum_iteration, numRows + maximum_iteration + 3); + ScratchPadArnoldiViewType _ArnoldiView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + maximum_iteration, numRows + maximum_iteration + 3); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } return 0; } template -template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { Identity P; - return invoke(member, A, _B, _X, P, - handle); + return invoke(member, A, _B, _X, P, handle); } } // namespace KokkosBatched diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index bb8f446f07..9fd9e09bd9 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -38,17 +38,15 @@ namespace KokkosBatched { /// template -template -KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView) { +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, const KrylovHandleType& handle, + const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename Kokkos::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef typename Kokkos::ArithTraits::mag_type MagnitudeType; typedef Kokkos::ArithTraits ATM; using TeamCopy1D = TeamCopy; @@ -56,9 +54,7 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); const MagnitudeType max_tolerance = handle.get_max_tolerance(); @@ -70,13 +66,10 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( int offset_H = offset_V + n_V; int offset_Givens = offset_H + n_H; - auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - _ArnoldiView, Kokkos::ALL, Kokkos::ALL, - Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + auto V_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview(_ArnoldiView, Kokkos::ALL, Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); int n_G = maximum_iteration + 1; int n_W = numRows; @@ -87,10 +80,8 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( int offset_mask = offset_W + n_W; int offset_tmp = offset_mask + n_mask; - auto G = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(_TMPView, Kokkos::ALL, - Kokkos::make_pair(offset_W, offset_W + n_W)); + auto G = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(_TMPView, Kokkos::ALL, Kokkos::make_pair(offset_W, offset_W + n_W)); auto mask = Kokkos::subview(_TMPView, Kokkos::ALL, offset_mask); auto tmp = Kokkos::subview(_TMPView, Kokkos::ALL, offset_tmp); @@ -108,33 +99,29 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( TeamDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_norm(member.league_rank(), i, 0, tmp(i)); - if (tmp(i) > max_tolerance) { - mask(i) = 1; - G(i, 0) = tmp(i); - tmp(i) = 1. / tmp(i); - } else { - handle.set_iteration(member.league_rank(), i, 0); - mask(i) = 0; - G(i, 0) = 0.; - tmp(i) = 0.; - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); member.team_barrier(); // Finish writing to tmp auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); int status = 1; // int number_not_converged = 0; @@ -150,18 +137,14 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( member.team_barrier(); if (handle.get_ortho_strategy() == 0) { - auto V_old = Kokkos::subview( - V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, - Kokkos::make_pair(0, (int)j + 1)); + auto V_old = Kokkos::subview(V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, Kokkos::make_pair(0, (int)j + 1)); // Inner products - TeamGemv::invoke( - member, 1, V_old, W, 0, H_old); + TeamGemv::invoke(member, 1, V_old, W, 0, H_old); member.team_barrier(); // Update - TeamGemv::invoke( - member, -1, V_old, H_old, 1, W); + TeamGemv::invoke(member, -1, V_old, H_old, 1, W); member.team_barrier(); // Finish writing to W } if (handle.get_ortho_strategy() == 1) { @@ -169,12 +152,10 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); TeamDot::invoke(member, W, V_i, tmp); member.team_barrier(); - TeamCopy1D::invoke(member, tmp, - Kokkos::subview(H_view, Kokkos::ALL, j, i)); + TeamCopy1D::invoke(member, tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); member.team_barrier(); // Finish writing to tmp @@ -185,82 +166,71 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( TeamDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = H_view(i, j, j + 1) > max_tolerance - ? 1. / H_view(i, j, j + 1) - : 0.; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; + }); member.team_barrier(); if (j + 1 < maximum_iteration) { auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); member.team_barrier(); } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); - auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); - auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); - - if (mask(l) == 1.) { - for (size_t i = 0; i < j; ++i) { - auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); - auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens_0_l(j) = G_new.first; - Givens_1_l(j) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); - auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens_1_l(j) * G(l, j); - G(l, j) *= Givens_0_l(j); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; - } - - auto res_norm = - Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); - - handle.set_norm(member.league_rank(), l, j + 1, res_norm); - - if (mask(l) == 1. && res_norm < tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - handle.set_iteration(member.league_rank(), l, j + 1); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) { + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } + }); member.team_barrier(); bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); + for (OrdinalType l = 0; l < numMatrices; ++l) all_converged = (all_converged && mask(l) == 0.); if (all_converged) { maximum_iteration = j + 1; break; @@ -271,30 +241,25 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); - auto B_l = Kokkos::subview(G, l, first_indices); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - SerialTrsm::invoke(1, A_l, B_l); - }); + SerialTrsm::invoke(1, A_l, B_l); + }); member.team_barrier(); // Finish writing to G if (handle.get_ortho_strategy() == 0) { TeamGemv::invoke( - member, 1, - Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), Kokkos::subview(G, Kokkos::ALL, first_indices), 1, _X); member.team_barrier(); // Finish writing to _X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { - TeamAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); + TeamAxpy::invoke(member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), _X); member.team_barrier(); // Finish writing to _X } } @@ -309,120 +274,97 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( TeamDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_last_norm(member.league_rank(), i, - tmp(i)); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, tmp(i)); + }); } return status; } template -template -KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, const KrylovHandleType& handle) { const int strategy = handle.get_memory_strategy(); if (strategy == 0) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 1) { const int first_matrix = handle.first_index(member.league_rank()); const int last_matrix = handle.last_index(member.league_rank()); - auto _ArnoldiView = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::ALL); + auto _ArnoldiView = + Kokkos::subview(handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL, Kokkos::ALL); - auto _TMPView = Kokkos::subview( - handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto _TMPView = Kokkos::subview(handle.tmp_view, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } if (strategy == 2) { - using ScratchPadArnoldiViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadArnoldiViewType = + Kokkos::View; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = + Kokkos::View; const int numMatrices = _X.extent(0); const int numRows = _X.extent(1); - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; + size_t maximum_iteration = handle.get_max_iteration() < numRows ? handle.get_max_iteration() : numRows; int n_G = maximum_iteration + 1; int n_W = numRows; int n_mask = 1; int n_tmp = 1; - ScratchPadArnoldiViewType _ArnoldiView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - maximum_iteration, numRows + maximum_iteration + 3); + ScratchPadArnoldiViewType _ArnoldiView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + maximum_iteration, numRows + maximum_iteration + 3); - ScratchPadVectorViewType _TMPView( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_mask + n_tmp); + ScratchPadVectorViewType _TMPView(member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_mask + n_tmp); - return invoke(member, A, _B, _X, P, handle, _ArnoldiView, - _TMPView); + return invoke(member, A, _B, _X, P, handle, + _ArnoldiView, _TMPView); } return 0; } template -template -KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle) { +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke(const MemberType& member, const OperatorType& A, + const VectorViewType& _B, const VectorViewType& _X, + const KrylovHandleType& handle) { Identity P; - return invoke(member, A, _B, _X, P, - handle); + return invoke(member, A, _B, _X, P, handle); } } // namespace KokkosBatched diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index 0f1e5feb39..3f76ee3d9f 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -26,35 +26,24 @@ namespace KokkosBatched { /// Serial Internal Impl /// ==================== struct SerialSpmvInternal { - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const OrdinalType numMatrices, const OrdinalType numRows, - const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, - const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, + const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; } sum *= alpha[iMatrix * alphas0]; @@ -62,8 +51,7 @@ struct SerialSpmvInternal { if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } @@ -71,33 +59,26 @@ struct SerialSpmvInternal { return 0; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const OrdinalType numMatrices, const OrdinalType numRows, - const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, + const OrdinalType ys1) { for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; } sum *= alpha; @@ -105,8 +86,7 @@ struct SerialSpmvInternal { if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } @@ -117,47 +97,32 @@ struct SerialSpmvInternal { template <> struct SerialSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const alphaViewType& alpha, const ValuesViewType& values, - const IntView& row_ptr, const IntView& colIndices, const xViewType& X, - const betaViewType& beta, const yViewType& Y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType& alpha, const ValuesViewType& values, + const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const betaViewType& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != alpha.extent(0)) { @@ -178,8 +143,7 @@ struct SerialSpmv { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -187,8 +151,7 @@ struct SerialSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -201,61 +164,43 @@ struct SerialSpmv { #endif return SerialSpmvInternal::template invoke< - typename alphaViewType::non_const_value_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta>( - X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), - values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), - Y.stride_0(), Y.stride_1()); + typename alphaViewType::non_const_value_type, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( + X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), values.stride_0(), values.stride_1(), + row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), + X.stride_1(), beta.data(), beta.stride_0(), Y.data(), Y.stride_0(), Y.stride_1()); } - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& beta, + const typename Kokkos::ArithTraits::mag_type& alpha, + const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const typename Kokkos::ArithTraits::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != values.extent(0)) { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -263,8 +208,7 @@ struct SerialSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -277,15 +221,12 @@ struct SerialSpmv { #endif return SerialSpmvInternal::template invoke< - typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, + typename Kokkos::ArithTraits::mag_type, + typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( - X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), - values.stride_1(), row_ptr.data(), row_ptr.stride_0(), - colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), - X.stride_1(), beta, Y.data(), Y.stride_0(), Y.stride_1()); + X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), + row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), X.stride_1(), beta, + Y.data(), Y.stride_0(), Y.stride_1()); } }; diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index dd510b2d0e..4df4b95e2c 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -27,50 +27,40 @@ namespace KokkosBatched { /// TeamVector Internal Impl /// ==================== struct TeamVectorSpmvInternal { - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); - - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OrdinalType numMatrices, + const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, + const OrdinalType ys1); }; -template +template KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) if (member.team_size() == 1) { if (N_team > 1 && valuess0 == 1) { @@ -87,8 +77,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( beta_v.loadAligned(beta); for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; VectorType sum_v(0); @@ -96,11 +85,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - values_v.loadAligned( - &values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); - x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]); + values_v.loadAligned(&values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); + x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]); sum_v += values_v * x_v; } sum_v *= alpha_v; @@ -113,20 +99,14 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( } else { for (unsigned iMatrix = 0; iMatrix < unsigned(numMatrices); ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; ValueType sum = 0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, rowLength), [&](const OrdinalType& iEntry, ValueType& lsum) { - lsum += - values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + lsum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; }, sum); @@ -135,63 +115,50 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } } } else { #endif - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); - - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; + } - sum *= alpha[iMatrix * alphas0]; + sum *= alpha[iMatrix * alphas0]; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) } #endif return 0; } -template +template KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) if (member.team_size() == 1) { if (N_team > 1 && valuess0 == 1 && valuess1 % N_team == 0) { @@ -205,8 +172,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( VectorType alpha_v(alpha), beta_v(beta), values_v, y_v, x_v; for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; VectorType sum_v(0); @@ -214,11 +180,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( #pragma unroll #endif for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - values_v.loadAligned( - &values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); - x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]); + values_v.loadAligned(&values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess1]); + x_v.loadAligned(&X[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]); sum_v += values_v * x_v; } sum_v *= alpha_v; @@ -231,20 +194,14 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( } else { for (unsigned iMatrix = 0; iMatrix < unsigned(numMatrices); ++iMatrix) { for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; ValueType sum = 0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, rowLength), [&](const OrdinalType& iEntry, ValueType& lsum) { - lsum += - values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; + lsum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; }, sum); @@ -253,45 +210,35 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( if (dobeta == 0) { Y[iMatrix * ys0 + iRow * ys1] = sum; } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; } } } } } else { #endif - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); - - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); + + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; + } - sum *= alpha; + sum *= alpha; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) } #endif @@ -300,52 +247,35 @@ KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( template struct TeamVectorSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const alphaViewType& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, const betaViewType& beta, - const yViewType& Y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const alphaViewType& alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& X, const betaViewType& beta, + const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != alpha.extent(0)) { @@ -366,8 +296,7 @@ struct TeamVectorSpmv { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -375,8 +304,7 @@ struct TeamVectorSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -389,68 +317,49 @@ struct TeamVectorSpmv { #endif if (values.extent(0) == 1) { return KokkosSparse::Experimental::team_vector_spmv( - member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), - row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), - beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, + Kokkos::subview(X, 0, Kokkos::ALL), beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamVectorSpmvInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta, N_team>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), - Y.stride_0(), Y.stride_1()); + MemberType, typename alphaViewType::non_const_value_type, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta, N_team>( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), values.stride_0(), + values.stride_1(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), + X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), Y.stride_0(), Y.stride_1()); } - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& beta, + const typename Kokkos::ArithTraits::mag_type& alpha, + const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const typename Kokkos::ArithTraits::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != values.extent(0)) { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -458,8 +367,7 @@ struct TeamVectorSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -472,22 +380,17 @@ struct TeamVectorSpmv { #endif if (values.extent(0) == 1) { return KokkosSparse::Experimental::team_vector_spmv( - member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, - colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, - Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, + Kokkos::subview(X, 0, Kokkos::ALL), beta, Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamVectorSpmvInternal::template invoke< - MemberType, - typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, + MemberType, typename Kokkos::ArithTraits::mag_type, + typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta, N_team>( - member, X.extent(0), X.extent(1), alpha, values.data(), - values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta, Y.data(), Y.stride_0(), Y.stride_1()); + member, X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), + row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), X.stride_1(), beta, + Y.data(), Y.stride_0(), Y.stride_1()); } }; diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index 41128744a3..9e32861612 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -27,176 +27,130 @@ namespace KokkosBatched { /// Team Internal Impl /// ==================== struct TeamSpmvInternal { - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1); - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OrdinalType numMatrices, + const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, + const OrdinalType ys1); }; -template +template KOKKOS_INLINE_FUNCTION int TeamSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType* KOKKOS_RESTRICT alpha, - const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, - const OrdinalType valuess0, const OrdinalType valuess1, - const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, - const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, - const OrdinalType xs0, const OrdinalType xs1, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, + const ScalarType* KOKKOS_RESTRICT alpha, const OrdinalType alphas0, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, + const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType* KOKKOS_RESTRICT beta, const OrdinalType betas0, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; + } - sum *= alpha[iMatrix * alphas0]; + sum *= alpha[iMatrix * alphas0]; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta[iMatrix * betas0] * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); return 0; } -template +template KOKKOS_INLINE_FUNCTION int TeamSpmvInternal::invoke( - const MemberType& member, const OrdinalType numMatrices, - const OrdinalType numRows, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, - const OrdinalType valuess1, const OrdinalType* KOKKOS_RESTRICT row_ptr, - const OrdinalType row_ptrs0, const OrdinalType* KOKKOS_RESTRICT colIndices, - const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, + const MemberType& member, const OrdinalType numMatrices, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType valuess1, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT X, const OrdinalType xs0, const OrdinalType xs1, const ScalarType beta, - /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, - const OrdinalType ys1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices(iTemp, numRows, numMatrices, iRow, - iMatrix); + /**/ ValueType* KOKKOS_RESTRICT Y, const OrdinalType ys0, const OrdinalType ys1) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices(iTemp, numRows, numMatrices, iRow, iMatrix); - const OrdinalType rowLength = - row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; - ValueType sum = 0; + const OrdinalType rowLength = row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { - sum += values[iMatrix * valuess0 + - (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * - X[iMatrix * xs0 + - colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * - colIndicess0] * - xs1]; - } + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[iMatrix * valuess0 + (row_ptr[iRow * row_ptrs0] + iEntry) * valuess1] * + X[iMatrix * xs0 + colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * colIndicess0] * xs1]; + } - sum *= alpha; + sum *= alpha; - if (dobeta == 0) { - Y[iMatrix * ys0 + iRow * ys1] = sum; - } else { - Y[iMatrix * ys0 + iRow * ys1] = - beta * Y[iMatrix * ys0 + iRow * ys1] + sum; - } - }); + if (dobeta == 0) { + Y[iMatrix * ys0 + iRow * ys1] = sum; + } else { + Y[iMatrix * ys0 + iRow * ys1] = beta * Y[iMatrix * ys0 + iRow * ys1] + sum; + } + }); return 0; } template struct TeamSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const alphaViewType& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, const betaViewType& beta, - const yViewType& Y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const alphaViewType& alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& X, const betaViewType& beta, + const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: alphaViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::rank == 1, - "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::rank == 1, - "KokkosBatched::spmv: betaViewType must have rank 1."); + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != alpha.extent(0)) { @@ -217,8 +171,7 @@ struct TeamSpmv { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -226,8 +179,7 @@ struct TeamSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -240,68 +192,49 @@ struct TeamSpmv { #endif if (values.extent(0) == 1) { return KokkosSparse::Experimental::team_spmv( - member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), - row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), - beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, + Kokkos::subview(X, 0, Kokkos::ALL), beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamSpmvInternal::template invoke< - MemberType, typename alphaViewType::non_const_value_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, - typename ValuesViewType::array_layout, dobeta>( - member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), - values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), - Y.stride_0(), Y.stride_1()); + MemberType, typename alphaViewType::non_const_value_type, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( + member, X.extent(0), X.extent(1), alpha.data(), alpha.stride_0(), values.data(), values.stride_0(), + values.stride_1(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), + X.stride_0(), X.stride_1(), beta.data(), beta.stride_0(), Y.data(), Y.stride_0(), Y.stride_1()); } - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& alpha, - const ValuesViewType& values, const IntView& row_ptr, - const IntView& colIndices, const xViewType& X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type& beta, + const typename Kokkos::ArithTraits::mag_type& alpha, + const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, + const typename Kokkos::ArithTraits::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: IntView is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: xViewType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::spmv: yViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: ValuesViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: IntView is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: xViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::rank == 2, - "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::rank == 2, - "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::rank == 2, - "KokkosBatched::spmv: yViewType must have rank 2."); + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::printf( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), - (int)Y.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); return 1; } if (X.extent(0) != values.extent(0)) { Kokkos::printf( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", - (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), - (int)values.extent(1)); + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); return 1; } if (colIndices.extent(0) != values.extent(1)) { @@ -309,8 +242,7 @@ struct TeamSpmv { "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", - (int)colIndices.extent(0), (int)values.extent(0), - (int)values.extent(1)); + (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { @@ -322,23 +254,18 @@ struct TeamSpmv { } #endif if (values.extent(0) == 1) { - return KokkosSparse::Experimental::team_spmv( - member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, - colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, - Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + return KokkosSparse::Experimental::team_spmv(member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), + row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), + beta, Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); } return TeamSpmvInternal::template invoke< - MemberType, - typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type, - typename ValuesViewType::non_const_value_type, - typename IntView::non_const_value_type, + MemberType, typename Kokkos::ArithTraits::mag_type, + typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, typename ValuesViewType::array_layout, dobeta>( - member, X.extent(0), X.extent(1), alpha, values.data(), - values.stride_0(), values.stride_1(), row_ptr.data(), - row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), - X.stride_0(), X.stride_1(), beta, Y.data(), Y.stride_0(), Y.stride_1()); + member, X.extent(0), X.extent(1), alpha, values.data(), values.stride_0(), values.stride_1(), row_ptr.data(), + row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), X.data(), X.stride_0(), X.stride_1(), beta, + Y.data(), Y.stride_0(), Y.stride_1()); } }; diff --git a/batched/sparse/src/KokkosBatched_CG.hpp b/batched/sparse/src/KokkosBatched_CG.hpp index baa6dca42e..cabf2eae98 100644 --- a/batched/sparse/src/KokkosBatched_CG.hpp +++ b/batched/sparse/src/KokkosBatched_CG.hpp @@ -42,22 +42,14 @@ namespace KokkosBatched { template struct CG { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const OperatorType &A, - const VectorViewType &B, - const VectorViewType &X, - const KrylovHandleType &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const OperatorType &A, const VectorViewType &B, + const VectorViewType &X, const KrylovHandleType &handle) { int status = 0; if (std::is_same::value) { - status = - TeamCG::template invoke( - member, A, B, X, handle); + status = TeamCG::template invoke(member, A, B, X, handle); } else if (std::is_same::value) { - status = TeamVectorCG::template invoke( - member, A, B, X, handle); + status = TeamVectorCG::template invoke(member, A, B, X, handle); } return status; } diff --git a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp index 92acc91a9e..0d880cd880 100644 --- a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp +++ b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp @@ -42,8 +42,7 @@ class CrsMatrix { public: KOKKOS_INLINE_FUNCTION - CrsMatrix(const ValuesViewType &_values, const IntViewType &_row_ptr, - const IntViewType &_colIndices) + CrsMatrix(const ValuesViewType &_values, const IntViewType &_row_ptr, const IntViewType &_colIndices) : values(_values), row_ptr(_row_ptr), colIndices(_colIndices) { n_operators = _values.extent(0); n_rows = _row_ptr.extent(0) - 1; @@ -77,45 +76,40 @@ class CrsMatrix { /// \param beta [in]: input coefficient for Y (default value 0.) /// \param Y [in/out]: Output vector Y, a rank 2 view - template - KOKKOS_INLINE_FUNCTION void apply( - const MemberType &member, const XViewType &X, const YViewType &Y, - MagnitudeType alpha = Kokkos::ArithTraits::one(), - MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + template + KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y, + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { if (beta == Kokkos::ArithTraits::zero()) { if (member.team_size() == 1 && n_operators == 8) - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 0>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 0>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); } else { if (member.team_size() == 1 && n_operators == 8) - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 1>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::TeamVectorSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 1>( + KokkosBatched::TeamVectorSpmv::template invoke( member, alpha, values, row_ptr, colIndices, X, beta, Y); } } template - KOKKOS_INLINE_FUNCTION void apply( - const XViewType &X, const YViewType &Y, - MagnitudeType alpha = Kokkos::ArithTraits::one(), - MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, const YViewType &Y, + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { if (beta == Kokkos::ArithTraits::zero()) - KokkosBatched::SerialSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 0>( + KokkosBatched::SerialSpmv::template invoke( alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::SerialSpmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, 1>( + KokkosBatched::SerialSpmv::template invoke( alpha, values, row_ptr, colIndices, X, beta, Y); } }; diff --git a/batched/sparse/src/KokkosBatched_GMRES.hpp b/batched/sparse/src/KokkosBatched_GMRES.hpp index 0d27bcd6fb..a3f4eda8d3 100644 --- a/batched/sparse/src/KokkosBatched_GMRES.hpp +++ b/batched/sparse/src/KokkosBatched_GMRES.hpp @@ -44,25 +44,16 @@ namespace KokkosBatched { template struct GMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const OperatorType &A, - const VectorViewType &B, - const VectorViewType &X, - const KrylovHandleType &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const OperatorType &A, const VectorViewType &B, + const VectorViewType &X, const KrylovHandleType &handle) { int status = 0; if (std::is_same::value) { - status = SerialGMRES::template invoke( - A, B, X, handle); + status = SerialGMRES::template invoke(A, B, X, handle); } else if (std::is_same::value) { - status = - TeamGMRES::template invoke( - member, A, B, X, handle); + status = TeamGMRES::template invoke(member, A, B, X, handle); } else if (std::is_same::value) { - status = TeamVectorGMRES::template invoke( - member, A, B, X, handle); + status = TeamVectorGMRES::template invoke(member, A, B, X, handle); } return status; } diff --git a/batched/sparse/src/KokkosBatched_Identity.hpp b/batched/sparse/src/KokkosBatched_Identity.hpp index 4e8e7c4308..311ec09d5c 100644 --- a/batched/sparse/src/KokkosBatched_Identity.hpp +++ b/batched/sparse/src/KokkosBatched_Identity.hpp @@ -34,26 +34,21 @@ class Identity { KOKKOS_INLINE_FUNCTION ~Identity() {} - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { if (sameXY == 0) { if (std::is_same::value) { SerialCopy::invoke(X, Y); } else if (std::is_same::value) { TeamCopy::invoke(member, X, Y); - } else if (std::is_same::value) { + } else if (std::is_same::value) { TeamVectorCopy::invoke(member, X, Y); } } } - template - KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, const YViewType &Y) const { if (sameXY == 0) { SerialCopy::invoke(X, Y); } diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index eacb859636..580f85158b 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -75,15 +75,12 @@ class JacobiPrec { Kokkos::TeamThreadRange(member, 0, n_operators * n_rows), [&](const int &iTemp, int <ooSmall) { int i, j; - getIndices( - iTemp, n_rows, n_operators, j, i); - if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= - epsilon) { + getIndices(iTemp, n_rows, n_operators, j, i); + if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= epsilon) { ltooSmall++; diag_values_array[i * vs0 + j * vs1] = one; } else - diag_values_array[i * vs0 + j * vs1] = - one / diag_values_array[i * vs0 + j * vs1]; + diag_values_array[i * vs0 + j * vs1] = one / diag_values_array[i * vs0 + j * vs1]; }, tooSmall); } else if (std::is_same::value) { @@ -95,15 +92,12 @@ class JacobiPrec { Kokkos::TeamVectorRange(member, 0, n_operators * n_rows), [&](const int &iTemp, int <ooSmall) { int i, j; - getIndices( - iTemp, n_rows, n_operators, j, i); - if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= - epsilon) { + getIndices(iTemp, n_rows, n_operators, j, i); + if (Kokkos::abs(diag_values_array[i * vs0 + j * vs1]) <= epsilon) { ltooSmall++; diag_values_array[i * vs0 + j * vs1] = one; } else - diag_values_array[i * vs0 + j * vs1] = - one / diag_values_array[i * vs0 + j * vs1]; + diag_values_array[i * vs0 + j * vs1] = one / diag_values_array[i * vs0 + j * vs1]; }, tooSmall); } @@ -138,31 +132,25 @@ class JacobiPrec { computed_inverse = true; } - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { if (!computed_inverse) { this->computeInverse(member); member.team_barrier(); // Finish writing to this->diag_values } - KokkosBatched::HadamardProduct::template invoke< - ValuesViewType, XViewType, YViewType>(member, diag_values, X, Y); + KokkosBatched::HadamardProduct::template invoke( + member, diag_values, X, Y); } - template - KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, - const YViewType &Y) const { + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, const YViewType &Y) const { if (!computed_inverse) { this->computeInverse(); } - KokkosBatched::SerialHadamardProduct::template invoke( - diag_values, X, Y); + KokkosBatched::SerialHadamardProduct::template invoke(diag_values, X, Y); } }; diff --git a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index 9992742dd8..c8e8392e11 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -51,8 +51,7 @@ class KrylovHandle { using norm_type = typename NormViewType::non_const_value_type; typedef ViewType3D ArnoldiViewType; - typedef Kokkos::View TemporaryViewType; @@ -81,8 +80,7 @@ class KrylovHandle { bool host_synchronised; public: - KrylovHandle(int _batched_size, int _N_team, int _max_iteration = 200, - bool _monitor_residual = false) + KrylovHandle(int _batched_size, int _N_team, int _max_iteration = 200, bool _monitor_residual = false) : max_iteration(_max_iteration), batched_size(_batched_size), N_team(_N_team), @@ -192,9 +190,7 @@ class KrylovHandle { /// \param batched_id [in]: Global batched ID KOKKOS_INLINE_FUNCTION - bool is_converged(int batched_id) const { - return (iteration_numbers(batched_id) != -1); - } + bool is_converged(int batched_id) const { return (iteration_numbers(batched_id) != -1); } /// \brief is_converged /// Test if one particular system has converged (host). @@ -226,9 +222,7 @@ class KrylovHandle { /// \param _max_tolerance [in]: New tolerance KOKKOS_INLINE_FUNCTION - void set_max_tolerance(norm_type _max_tolerance) { - max_tolerance = _max_tolerance; - } + void set_max_tolerance(norm_type _max_tolerance) { max_tolerance = _max_tolerance; } /// \brief get_max_tolerance /// Get the maximal tolerance of the batched Krylov solver @@ -310,9 +304,7 @@ class KrylovHandle { /// \param batched_id [in]: Global batched ID KOKKOS_INLINE_FUNCTION - int get_iteration(int batched_id) const { - return iteration_numbers(batched_id); - } + int get_iteration(int batched_id) const { return iteration_numbers(batched_id); } /// \brief get_iteration_host /// Get the number of iteration after convergence for one system (host) @@ -332,9 +324,7 @@ class KrylovHandle { /// \param _ortho_strategy [in]: used orthogonalization strategy KOKKOS_INLINE_FUNCTION - void set_ortho_strategy(int _ortho_strategy) { - ortho_strategy = _ortho_strategy; - } + void set_ortho_strategy(int _ortho_strategy) { ortho_strategy = _ortho_strategy; } /// \brief get_ortho_strategy /// Get the used orthogonalization strategy. @@ -350,9 +340,7 @@ class KrylovHandle { /// \param _scratch_pad_level [in]: used level KOKKOS_INLINE_FUNCTION - void set_scratch_pad_level(int _scratch_pad_level) { - scratch_pad_level = _scratch_pad_level; - } + void set_scratch_pad_level(int _scratch_pad_level) { scratch_pad_level = _scratch_pad_level; } /// \brief get_scratch_pad_level /// Get the scratch pad level used to store temporary variables. @@ -386,9 +374,7 @@ class KrylovHandle { } KOKKOS_INLINE_FUNCTION - void set_memory_strategy(int _memory_strategy) { - memory_strategy = _memory_strategy; - } + void set_memory_strategy(int _memory_strategy) { memory_strategy = _memory_strategy; } KOKKOS_INLINE_FUNCTION int get_memory_strategy() const { return memory_strategy; } @@ -415,10 +401,8 @@ class KrylovHandle { /// \param norm_i [in]: Norm to store KOKKOS_INLINE_FUNCTION - void set_norm(int team_id, int batched_id, int iteration_id, - norm_type norm_i) const { - if (monitor_residual) - residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; + void set_norm(int team_id, int batched_id, int iteration_id, norm_type norm_i) const { + if (monitor_residual) residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; } /// \brief set_last_norm @@ -429,8 +413,7 @@ class KrylovHandle { KOKKOS_INLINE_FUNCTION void set_last_norm(int batched_id, norm_type norm_i) const { - if (monitor_residual) - residual_norms(batched_id, max_iteration + 1) = norm_i; + if (monitor_residual) residual_norms(batched_id, max_iteration + 1) = norm_i; } /// \brief set_last_norm @@ -442,8 +425,7 @@ class KrylovHandle { KOKKOS_INLINE_FUNCTION void set_last_norm(int team_id, int batched_id, norm_type norm_i) const { - if (monitor_residual) - residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; + if (monitor_residual) residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; } /// \brief set_iteration @@ -453,9 +435,7 @@ class KrylovHandle { /// \param iteration_id [in]: Iteration ID KOKKOS_INLINE_FUNCTION - void set_iteration(int batched_id, int iteration_id) const { - iteration_numbers(batched_id) = iteration_id; - } + void set_iteration(int batched_id, int iteration_id) const { iteration_numbers(batched_id) = iteration_id; } /// \brief set_iteration /// Store the number of iteration after convergence for one system diff --git a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp index 262167ee64..b07ed2b973 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp @@ -20,110 +20,71 @@ namespace KokkosBatched { struct SerialGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle, + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, + const PrecOperatorType& P, const KrylovHandleType& handle, const int GMRES_id); - template - KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, const KrylovHandleType& handle); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamVectorGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, - const TMPViewType& _TMPView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, + const TMPViewType& _TMPView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, const KrylovHandleType& handle); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle); }; template struct TeamVectorCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const KrylovHandleType& handle, - const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle, + const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle); }; } // namespace KokkosBatched diff --git a/batched/sparse/src/KokkosBatched_Spmv.hpp b/batched/sparse/src/KokkosBatched_Spmv.hpp index da70acb6bb..a93d0775be 100644 --- a/batched/sparse/src/KokkosBatched_Spmv.hpp +++ b/batched/sparse/src/KokkosBatched_Spmv.hpp @@ -64,23 +64,17 @@ namespace KokkosBatched { template struct SerialSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const alphaViewType &alpha, const ValuesViewType &values, - const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const betaViewType &beta, const yViewType &Y); + template + KOKKOS_INLINE_FUNCTION static int invoke(const alphaViewType &alpha, const ValuesViewType &values, + const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const betaViewType &beta, const yViewType &Y); - template + template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &X, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &X, + const typename Kokkos::ArithTraits::mag_type &beta, const yViewType &Y); }; @@ -126,25 +120,19 @@ struct SerialSpmv { template struct TeamSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const alphaViewType &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, const betaViewType &beta, - const yViewType &y); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, + const ValuesViewType &values, const IntView &row_ptr, + const IntView &colIndices, const xViewType &x, const betaViewType &beta, + const yViewType &y); - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const typename Kokkos::ArithTraits::mag_type &beta, const yViewType &y); }; @@ -189,28 +177,21 @@ struct TeamSpmv { /// (or one with TeamVectorRange) are used inside. /// -template +template struct TeamVectorSpmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const alphaViewType &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, const betaViewType &beta, - const yViewType &y); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, + const ValuesViewType &values, const IntView &row_ptr, + const IntView &colIndices, const xViewType &x, const betaViewType &beta, + const yViewType &y); - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const typename Kokkos::ArithTraits::mag_type &beta, const yViewType &y); }; @@ -245,58 +226,47 @@ struct TeamVectorSpmv { template struct Spmv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const alphaViewType &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, const betaViewType &beta, - const yViewType &y) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const alphaViewType &alpha, + const ValuesViewType &values, const IntView &row_ptr, + const IntView &colIndices, const xViewType &x, const betaViewType &beta, + const yViewType &y) { int r_val = 0; if (std::is_same::value) { - r_val = SerialSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(alpha, values, row_ptr, colIndices, x, beta, y); + r_val = + SerialSpmv::template invoke(alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(member, alpha, values, row_ptr, colIndices, x, - beta, y); + r_val = TeamSpmv::template invoke( + member, alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamVectorSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(member, alpha, values, row_ptr, colIndices, x, - beta, y); + r_val = TeamVectorSpmv::template invoke( + member, alpha, values, row_ptr, colIndices, x, beta, y); } return r_val; } - template + template KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &alpha, - const ValuesViewType &values, const IntView &row_ptr, - const IntView &colIndices, const xViewType &x, - const typename Kokkos::ArithTraits< - typename ValuesViewType::non_const_value_type>::mag_type &beta, + const typename Kokkos::ArithTraits::mag_type &alpha, + const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const typename Kokkos::ArithTraits::mag_type &beta, const yViewType &y) { int r_val = 0; if (std::is_same::value) { - r_val = - SerialSpmv::template invoke( - alpha, values, row_ptr, colIndices, x, beta, y); + r_val = SerialSpmv::template invoke( + alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, dobeta>( + r_val = TeamSpmv::template invoke( member, alpha, values, row_ptr, colIndices, x, beta, y); } else if (std::is_same::value) { - r_val = TeamVectorSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, dobeta>( - member, alpha, values, row_ptr, colIndices, x, beta, y); + r_val = + TeamVectorSpmv::template invoke( + member, alpha, values, row_ptr, colIndices, x, beta, y); } return r_val; } diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp index e28efb9b82..3147caefae 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp @@ -29,8 +29,8 @@ using namespace KokkosBatched; namespace Test { namespace GMRES { -template +template struct Functor_TestBatchedSerialGMRES { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -42,32 +42,19 @@ struct Functor_TestBatchedSerialGMRES { const int _N_team; KrylovHandleType _handle; - Functor_TestBatchedSerialGMRES(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, - const VectorViewType &diag, const int N_team, + Functor_TestBatchedSerialGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, + const VectorViewType &B, const VectorViewType &diag, const int N_team, KrylovHandleType &handle) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _Diag(diag), - _N_team(N_team), - _handle(handle) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team), _handle(handle) {} KOKKOS_INLINE_FUNCTION void operator()(const int k) const { const int first_matrix = _handle.first_index(k); const int last_matrix = _handle.last_index(k); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto diag = Kokkos::subview( - _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; using PrecOperator = KokkosBatched::JacobiPrec; @@ -76,8 +63,7 @@ struct Functor_TestBatchedSerialGMRES { PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::SerialGMRES::template invoke( - A, b, x, P, _handle, k); + KokkosBatched::SerialGMRES::template invoke(A, b, x, P, _handle, k); } inline void run() { @@ -96,18 +82,16 @@ struct Functor_TestBatchedSerialGMRES { _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); - _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( - "", N, maximum_iteration, n + maximum_iteration + 3); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", N, n + maximum_iteration + 3); + _handle.Arnoldi_view = + typename KrylovHandleType::ArnoldiViewType("", N, maximum_iteration, n + maximum_iteration + 3); + _handle.tmp_view = typename KrylovHandleType::TemporaryViewType("", N, n + maximum_iteration + 3); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -133,8 +117,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -153,12 +136,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { int current_index; for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = values_host(j, current_index); } Kokkos::deep_copy(Diag, diag_values_host); @@ -188,13 +169,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KrylovHandleType handle(N, N_team, n_iterations); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedSerialGMRES( + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedSerialGMRES( D, r, c, X, B, Diag, N_team, handle) .run(); @@ -205,17 +183,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e5 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK( - std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); } } // namespace GMRES } // namespace Test @@ -226,26 +200,21 @@ int test_batched_serial_GMRES() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp index ccfe3c37d5..2756e11a1f 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { - test_batched_serial_GMRES(); -} +TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { test_batched_serial_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { - test_batched_serial_GMRES(); -} +TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { test_batched_serial_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp index 05f2724c5b..2f32b6294a 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Spmv.hpp" #include "KokkosBatched_Spmv_Serial_Impl.hpp" @@ -37,9 +37,8 @@ struct ParamTag { typedef T trans; }; -template +template struct Functor_TestBatchedSerialSpmv { using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; @@ -51,10 +50,8 @@ struct Functor_TestBatchedSerialSpmv { const yViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialSpmv(const alphaViewType &alpha, - const ValuesViewType &D, const IntView &r, - const IntView &c, const xViewType &X, - const betaViewType &beta, const yViewType &Y) + Functor_TestBatchedSerialSpmv(const alphaViewType &alpha, const ValuesViewType &D, const IntView &r, const IntView &c, + const xViewType &X, const betaViewType &beta, const yViewType &Y) : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y) {} KOKKOS_INLINE_FUNCTION @@ -66,8 +63,8 @@ struct Functor_TestBatchedSerialSpmv { auto y = Kokkos::subview(_Y, Kokkos::make_pair(k, k + 1), Kokkos::ALL); KokkosBatched::SerialSpmv::template invoke< - ValuesViewType, IntView, xViewType, yViewType, alphaViewType, - betaViewType, dobeta>(alpha, d, _r, _c, x, beta, y); + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(alpha, d, _r, _c, x, beta, + y); } inline void run() { @@ -82,9 +79,8 @@ struct Functor_TestBatchedSerialSpmv { } }; -template +template void impl_test_batched_spmv(const int N, const int BlkSize) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -126,21 +122,15 @@ void impl_test_batched_spmv(const int N, const int BlkSize) { else Y0_host(l, i) *= beta_host(l); if (i != 0 && i != (BlkSize - 1)) - Y0_host(l, i) += - alpha_host(l) * - (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); else if (i == 0) - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); else - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); } - Functor_TestBatchedSerialSpmv(alpha, D, r, c, X1, beta, - Y1) + Functor_TestBatchedSerialSpmv(alpha, D, r, c, X1, beta, Y1) .run(); Kokkos::fence(); @@ -165,49 +155,37 @@ void impl_test_batched_spmv(const int N, const int BlkSize) { } // namespace Spmv } // namespace Test -template +template int test_batched_spmv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, i); } for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, i); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, i); } for (int i = 3; i < 10; ++i) { - Test::Spmv::impl_test_batched_spmv(1024, - i); + Test::Spmv::impl_test_batched_spmv(1024, i); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp b/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp index 98bc25894f..808f95a9a7 100644 --- a/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp +++ b/batched/sparse/unit_test/Test_Batched_SparseUtils.hpp @@ -18,21 +18,12 @@ namespace KokkosBatched { template -void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, - const int N, const IntView &r, - const IntView &c, - const VectorViewType &D, - const VectorViewType &X, +void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, const int N, const IntView &r, + const IntView &c, const VectorViewType &D, const VectorViewType &X, const VectorViewType &B) { - Kokkos::Random_XorShift64_Pool< - typename VectorViewType::device_type::execution_space> - random(13718); - Kokkos::fill_random( - X, random, - Kokkos::reduction_identity::prod()); - Kokkos::fill_random( - B, random, - Kokkos::reduction_identity::prod()); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, Kokkos::reduction_identity::prod()); + Kokkos::fill_random(B, random, Kokkos::reduction_identity::prod()); auto D_host = Kokkos::create_mirror_view(D); auto r_host = Kokkos::create_mirror_view(r); diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp index b05f3db61f..3c0b194faf 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp @@ -28,8 +28,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamCG { -template +template struct Functor_TestBatchedTeamCG { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -40,32 +40,21 @@ struct Functor_TestBatchedTeamCG { const int _N_team; KrylovHandleType handle; - Functor_TestBatchedTeamCG(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_TestBatchedTeamCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _N_team(N_team), - handle(KrylovHandleType(_D.extent(0), _N_team)) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; @@ -80,8 +69,7 @@ struct Functor_TestBatchedTeamCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); @@ -92,8 +80,7 @@ struct Functor_TestBatchedTeamCG { } }; -template +template void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -118,8 +105,7 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -147,13 +133,11 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(D_host, D); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamCG(D, r, c, X, B, N_team) + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamCG(D, r, c, X, B, + N_team) .run(); Kokkos::fence(); @@ -163,16 +147,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); } } // namespace TeamCG } // namespace Test @@ -183,26 +164,21 @@ int test_batched_team_CG() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamCG::impl_test_batched_CG(1024, i, 2); + Test::TeamCG::impl_test_batched_CG(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamCG::impl_test_batched_CG(1024, i, 2); + Test::TeamCG::impl_test_batched_CG(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp index 1bdb6bc95a..9d51be581b 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_CG_float) { - test_batched_team_CG(); -} +TEST_F(TestCategory, batched_scalar_team_CG_float) { test_batched_team_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_CG_double) { - test_batched_team_CG(); -} +TEST_F(TestCategory, batched_scalar_team_CG_double) { test_batched_team_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp index de1a7f4fc2..e2250bab95 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp @@ -29,8 +29,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamGMRES { -template +template struct Functor_TestBatchedTeamGMRES { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -42,37 +42,23 @@ struct Functor_TestBatchedTeamGMRES { const int _N_team; KrylovHandleType _handle; - Functor_TestBatchedTeamGMRES(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, - const VectorViewType &diag, const int N_team, + Functor_TestBatchedTeamGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, + const VectorViewType &B, const VectorViewType &diag, const int N_team, KrylovHandleType &handle) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _Diag(diag), - _N_team(N_team), - _handle(handle) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team), _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto diag = Kokkos::subview( - _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; using PrecOperator = KokkosBatched::JacobiPrec; @@ -81,9 +67,7 @@ struct Functor_TestBatchedTeamGMRES { PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamGMRES::template invoke( - member, A, b, x, P, _handle); + KokkosBatched::TeamGMRES::template invoke(member, A, b, x, P, _handle); } inline void run() { @@ -92,8 +76,7 @@ struct Functor_TestBatchedTeamGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); @@ -103,8 +86,8 @@ struct Functor_TestBatchedTeamGMRES { _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); - _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( - "", N, maximum_iteration, n + maximum_iteration + 3); + _handle.Arnoldi_view = + typename KrylovHandleType::ArnoldiViewType("", N, maximum_iteration, n + maximum_iteration + 3); using ScalarType = typename ValuesViewType::non_const_value_type; using Layout = typename ValuesViewType::array_layout; @@ -122,16 +105,14 @@ struct Functor_TestBatchedTeamGMRES { size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( - 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -157,8 +138,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -177,12 +157,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { int current_index; for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = values_host(j, current_index); } Kokkos::deep_copy(Diag, diag_values_host); @@ -212,13 +190,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KrylovHandleType handle(N, N_team, n_iterations); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamGMRES( + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamGMRES( D, r, c, X, B, Diag, N_team, handle) .run(); @@ -229,17 +204,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e5 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK( - std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); } } // namespace TeamGMRES } // namespace Test @@ -250,26 +221,21 @@ int test_batched_team_GMRES() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); + Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); + Test::TeamGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp index f8aab13eec..3ca0466630 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_GMRES_float) { - test_batched_team_GMRES(); -} +TEST_F(TestCategory, batched_scalar_team_GMRES_float) { test_batched_team_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_GMRES_double) { - test_batched_team_GMRES(); -} +TEST_F(TestCategory, batched_scalar_team_GMRES_double) { test_batched_team_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp index a6c9ac7ea8..228bd01afa 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Spmv.hpp" #include "KokkosBatched_Spmv_Team_Impl.hpp" @@ -38,9 +38,8 @@ struct ParamTag { typedef T trans; }; -template +template struct Functor_TestBatchedTeamSpmv { using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; @@ -53,45 +52,27 @@ struct Functor_TestBatchedTeamSpmv { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamSpmv(const alphaViewType &alpha, - const ValuesViewType &D, const IntView &r, - const IntView &c, const xViewType &X, - const betaViewType &beta, const yViewType &Y, - const int N_team) - : _alpha(alpha), - _D(D), - _r(r), - _c(c), - _X(X), - _beta(beta), - _Y(Y), - _N_team(N_team) {} + Functor_TestBatchedTeamSpmv(const alphaViewType &alpha, const ValuesViewType &D, const IntView &r, const IntView &c, + const xViewType &X, const betaViewType &beta, const yViewType &Y, const int N_team) + : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y), _N_team(N_team) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); - - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto beta = - Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - KokkosBatched::TeamSpmv:: - template invoke( - member, alpha, d, _r, _c, x, beta, y); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); + + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto beta = Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + KokkosBatched::TeamSpmv::template invoke< + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(member, alpha, d, _r, _c, x, + beta, y); } inline void run() { @@ -100,16 +81,14 @@ struct Functor_TestBatchedTeamSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( - _D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -151,20 +130,15 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { else Y0_host(l, i) *= beta_host(l); if (i != 0 && i != (BlkSize - 1)) - Y0_host(l, i) += - alpha_host(l) * - (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); else if (i == 0) - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); else - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); } - Functor_TestBatchedTeamSpmv(alpha, D, r, c, X1, beta, Y1, N_team) + Functor_TestBatchedTeamSpmv(alpha, D, r, c, X1, beta, Y1, N_team) .run(); Kokkos::fence(); @@ -189,50 +163,38 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { } // namespace TeamSpmv } // namespace Test -template +template int test_batched_team_spmv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } for (int i = 3; i < 10; ++i) { - Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } for (int i = 3; i < 10; ++i) { - Test::TeamSpmv::impl_test_batched_spmv( - 1024, i, 2); + Test::TeamSpmv::impl_test_batched_spmv(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp index 3ffd68209b..9ca4405b89 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp @@ -28,8 +28,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamVectorCG { -template +template struct Functor_TestBatchedTeamVectorCG { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -40,40 +40,27 @@ struct Functor_TestBatchedTeamVectorCG { const int _N_team; KrylovHandleType handle; - Functor_TestBatchedTeamVectorCG(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_TestBatchedTeamVectorCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _N_team(N_team), - handle(KrylovHandleType(_D.extent(0), _N_team)) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; Operator A(d, _r, _c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, handle); } inline void run() { @@ -82,8 +69,7 @@ struct Functor_TestBatchedTeamVectorCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); @@ -94,8 +80,7 @@ struct Functor_TestBatchedTeamVectorCG { } }; -template +template void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -120,8 +105,7 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -149,14 +133,11 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(D_host, D); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamVectorCG(D, r, c, X, - B, N_team) + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamVectorCG(D, r, c, X, B, + N_team) .run(); Kokkos::fence(); @@ -166,16 +147,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e3 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l) / sqr_norm_0_host(l), 0, eps); } } // namespace TeamVectorCG } // namespace Test @@ -186,26 +164,21 @@ int test_batched_teamvector_CG() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); + Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); + Test::TeamVectorCG::impl_test_batched_CG(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp index 859a1a885c..85935e07f3 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_teamvector_CG_float) { - test_batched_teamvector_CG(); -} +TEST_F(TestCategory, batched_scalar_teamvector_CG_float) { test_batched_teamvector_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_teamvector_CG_double) { - test_batched_teamvector_CG(); -} +TEST_F(TestCategory, batched_scalar_teamvector_CG_double) { test_batched_teamvector_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp index 084b623aa2..a14077f014 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp @@ -29,8 +29,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamVectorGMRES { -template +template struct Functor_TestBatchedTeamVectorGMRES { using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; @@ -42,37 +42,23 @@ struct Functor_TestBatchedTeamVectorGMRES { const int _N_team; KrylovHandleType _handle; - Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, - const VectorViewType &diag, + Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const VectorViewType &diag, const int N_team, KrylovHandleType &handle) - : _D(D), - _r(r), - _c(c), - _X(X), - _B(B), - _Diag(diag), - _N_team(N_team), - _handle(handle) {} + : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team), _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto diag = Kokkos::subview( - _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; using PrecOperator = KokkosBatched::JacobiPrec; @@ -81,9 +67,7 @@ struct Functor_TestBatchedTeamVectorGMRES { PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } inline void run() { @@ -92,8 +76,7 @@ struct Functor_TestBatchedTeamVectorGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); @@ -103,8 +86,8 @@ struct Functor_TestBatchedTeamVectorGMRES { _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); - _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( - "", N, maximum_iteration, n + maximum_iteration + 3); + _handle.Arnoldi_view = + typename KrylovHandleType::ArnoldiViewType("", N, maximum_iteration, n + maximum_iteration + 3); using ScalarType = typename ValuesViewType::non_const_value_type; using Layout = typename ValuesViewType::array_layout; @@ -122,16 +105,14 @@ struct Functor_TestBatchedTeamVectorGMRES { size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( - 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -157,8 +138,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KrylovHandle; + using KrylovHandleType = KrylovHandle; NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -177,12 +157,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { int current_index; for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = values_host(j, current_index); } Kokkos::deep_copy(Diag, diag_values_host); @@ -212,13 +190,10 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KrylovHandleType handle(N, N_team, n_iterations); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_0_host); - Functor_TestBatchedTeamVectorGMRES( + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); + Functor_TestBatchedTeamVectorGMRES( D, r, c, X, B, Diag, N_team, handle) .run(); @@ -229,17 +204,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(X_host, X); KokkosBatched::SerialSpmv::template invoke< - typename ValuesViewType::HostMirror, typename IntView::HostMirror, - typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, - 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); - KokkosBatched::SerialDot::invoke(R_host, R_host, - sqr_norm_j_host); + typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, + typename VectorViewType::HostMirror, 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_j_host); const MagnitudeType eps = 1.0e5 * ats::epsilon(); - for (int l = 0; l < N; ++l) - EXPECT_NEAR_KK( - std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); } } // namespace TeamVectorGMRES } // namespace Test @@ -250,28 +221,21 @@ int test_batched_teamvector_GMRES() { { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorGMRES::impl_test_batched_GMRES( - 1024, i, 2); + Test::TeamVectorGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - VectorViewType; + typedef Kokkos::View VectorViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorGMRES::impl_test_batched_GMRES( - 1024, i, 2); + Test::TeamVectorGMRES::impl_test_batched_GMRES(1024, i, 2); } } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp index 53b740deaa..ab889844a9 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp @@ -15,13 +15,9 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_teamvector_GMRES_float) { - test_batched_teamvector_GMRES(); -} +TEST_F(TestCategory, batched_scalar_teamvector_GMRES_float) { test_batched_teamvector_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_teamvector_GMRES_double) { - test_batched_teamvector_GMRES(); -} +TEST_F(TestCategory, batched_scalar_teamvector_GMRES_double) { test_batched_teamvector_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp index 9cbba56370..83a78228b3 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp @@ -19,7 +19,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -//#include "KokkosBatched_Vector.hpp" +// #include "KokkosBatched_Vector.hpp" #include "KokkosBatched_Spmv.hpp" #include "KokkosBatched_Spmv_TeamVector_Impl.hpp" @@ -38,9 +38,8 @@ struct ParamTag { typedef T trans; }; -template +template struct Functor_TestBatchedTeamVectorSpmv { using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; @@ -53,52 +52,33 @@ struct Functor_TestBatchedTeamVectorSpmv { const int _N_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSpmv(const alphaViewType &alpha, - const ValuesViewType &D, const IntView &r, - const IntView &c, const xViewType &X, - const betaViewType &beta, - const yViewType &Y, const int N_team) - : _alpha(alpha), - _D(D), - _r(r), - _c(c), - _X(X), - _beta(beta), - _Y(Y), - _N_team(N_team) {} + Functor_TestBatchedTeamVectorSpmv(const alphaViewType &alpha, const ValuesViewType &D, const IntView &r, + const IntView &c, const xViewType &X, const betaViewType &beta, const yViewType &Y, + const int N_team) + : _alpha(alpha), _D(D), _r(r), _c(c), _X(X), _beta(beta), _Y(Y), _N_team(N_team) {} template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, const MemberType &member) const { const int first_matrix = static_cast(member.league_rank()) * _N_team; const int N = _D.extent(0); const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); - - auto alpha = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto beta = - Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); - auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + (static_cast(member.league_rank() + 1) * _N_team < N ? static_cast(member.league_rank() + 1) * _N_team + : N); + + auto alpha = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto beta = Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); + auto y = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); if (last_matrix != N) - KokkosBatched::TeamVectorSpmv< - MemberType, typename ParamTagType::trans, - 2>::template invoke( - member, alpha, d, _r, _c, x, beta, y); + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(member, alpha, d, _r, _c, + x, beta, y); else - KokkosBatched::TeamVectorSpmv:: - template invoke( - member, alpha, d, _r, _c, x, beta, y); + KokkosBatched::TeamVectorSpmv::template invoke< + ValuesViewType, IntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>(member, alpha, d, _r, _c, + x, beta, y); } inline void run() { @@ -107,17 +87,15 @@ struct Functor_TestBatchedTeamVectorSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( - ceil(static_cast(_D.extent(0)) / _N_team), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(ceil(static_cast(_D.extent(0)) / _N_team), + Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; -template +template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; typedef Kokkos::ArithTraits ats; @@ -159,21 +137,15 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { else Y0_host(l, i) *= beta_host(l); if (i != 0 && i != (BlkSize - 1)) - Y0_host(l, i) += - alpha_host(l) * - (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1) - X0_host(l, i + 1)); else if (i == 0) - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i + 1)); else - Y0_host(l, i) += - alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); + Y0_host(l, i) += alpha_host(l) * (2 * X0_host(l, i) - X0_host(l, i - 1)); } - Functor_TestBatchedTeamVectorSpmv( - alpha, D, r, c, X1, beta, Y1, N_team) + Functor_TestBatchedTeamVectorSpmv(alpha, D, r, c, X1, beta, Y1, N_team) .run(); Kokkos::fence(); @@ -198,45 +170,37 @@ void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { } // namespace TeamVectorSpmv } // namespace Test -template +template int test_batched_teamvector_spmv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 0>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 1>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; typedef Kokkos::View IntView; - typedef Kokkos::View - alphaViewType; + typedef Kokkos::View alphaViewType; for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 0>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } for (int i = 3; i < 10; ++i) { - Test::TeamVectorSpmv::impl_test_batched_spmv< - DeviceType, ParamTagType, ViewType, IntView, ViewType, ViewType, - alphaViewType, alphaViewType, 1>(1025, i, 2); + Test::TeamVectorSpmv::impl_test_batched_spmv(1025, i, 2); } } #endif diff --git a/blas/impl/KokkosBlas1_abs_impl.hpp b/blas/impl/KokkosBlas1_abs_impl.hpp index 0334adbafe..0c674f25f5 100644 --- a/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/blas/impl/KokkosBlas1_abs_impl.hpp @@ -37,8 +37,7 @@ struct MV_Abs_Functor { RMV R_; XMV X_; - MV_Abs_Functor(const RMV& R, const XMV& X) - : numCols(X.extent(1)), R_(R), X_(X) { + MV_Abs_Functor(const RMV& R, const XMV& X) : numCols(X.extent(1)), R_(R), X_(X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Abs_Functor: RMV is not a Kokkos::View."); @@ -163,8 +162,7 @@ void MV_Abs_Generic(const execution_space& space, const RMV& R, const XMV& X) { const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if ((void*)(R.data()) == - (void*)(X.data())) { // if R and X are the same (alias one another) + if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) MV_AbsSelf_Functor op(R); Kokkos::parallel_for("KokkosBlas::Abs::S0", policy, op); } else { @@ -192,8 +190,7 @@ void V_Abs_Generic(const execution_space& space, const RV& R, const XV& X) { const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if ((void*)(R.data()) == - (void*)(X.data())) { // if R and X are the same (alias one another) + if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) V_AbsSelf_Functor op(R); Kokkos::parallel_for("KokkosBlas::Abs::S2", policy, op); } else { diff --git a/blas/impl/KokkosBlas1_abs_spec.hpp b/blas/impl/KokkosBlas1_abs_spec.hpp index a4695bd505..fb6357b38e 100644 --- a/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_abs_spec.hpp @@ -42,17 +42,15 @@ struct abs_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ABS_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct abs_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ABS_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct abs_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -62,18 +60,15 @@ struct abs_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct abs_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct abs_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -85,10 +80,9 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = RMV::rank, - bool tpl_spec_avail = abs_tpl_spec_avail::value, - bool eti_spec_avail = abs_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = abs_eti_spec_avail::value> struct Abs { static void abs(const execution_space& space, const RMV& R, const XMV& X); }; @@ -96,8 +90,7 @@ struct Abs { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Abs for single vectors (1-D Views). template -struct Abs { +struct Abs { using size_type = typename XMV::size_type; static void abs(const execution_space& space, const RMV& R, const XMV& X) { @@ -113,16 +106,13 @@ struct Abs: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::abs[ETI]" - : "KokkosBlas::abs[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::abs[ETI]" + : "KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::abs<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::abs<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -139,8 +129,7 @@ struct Abs -struct Abs { +struct Abs { using size_type = typename XMV::size_type; static void abs(const execution_space& space, const RMV& R, const XMV& X) { @@ -156,23 +145,19 @@ struct Abs: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::abs[ETI]" - : "KokkosBlas::abs[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::abs[ETI]" + : "KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; MV_Abs_Generic(space, R, X); } else { @@ -194,14 +179,12 @@ struct Abs, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -209,14 +192,12 @@ struct Abs, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -226,15 +207,12 @@ struct Abs, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -242,15 +220,12 @@ struct Abs, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Abs< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index b919d76a94..6baed662cf 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -25,14 +25,12 @@ namespace KokkosBlas { namespace Impl { template -constexpr typename std::enable_if, int>::type -axpbyVarExtent(T& v) { +constexpr typename std::enable_if, int>::type axpbyVarExtent(T& v) { return v.extent(0); } template -constexpr typename std::enable_if, int>::type -axpbyVarExtent(T&) { +constexpr typename std::enable_if, int>::type axpbyVarExtent(T&) { return 0; } @@ -58,8 +56,7 @@ axpbyVarExtent(T&) { // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. -template +template struct Axpby_Functor { typedef typename YV::execution_space execution_space; typedef SizeType size_type; @@ -70,8 +67,7 @@ struct Axpby_Functor { AV m_a; BV m_b; - Axpby_Functor(const XV& x, const YV& y, const AV& av, const BV& bv, - const SizeType startingColumn) + Axpby_Functor(const XV& x, const YV& y, const AV& av, const BV& bv, const SizeType startingColumn) : m_x(x), m_y(y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" @@ -79,8 +75,7 @@ struct Axpby_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -90,18 +85,15 @@ struct Axpby_Functor { static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": XV and YV must have rank 1."); - static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2), + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2), "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { if (axpbyVarExtent(m_a) > 1) { - m_a = Kokkos::subview( - av, std::make_pair(startingColumn, SizeType(av.extent(0)))); + m_a = Kokkos::subview(av, std::make_pair(startingColumn, SizeType(av.extent(0)))); } if (axpbyVarExtent(m_b) > 1) { - m_b = Kokkos::subview( - bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + m_b = Kokkos::subview(bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); } } } @@ -123,10 +115,8 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { // Nothing to do: m_y(i) = m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { - m_y(i) = - Kokkos::ArithTraits::zero(); + if (m_b(0) == Kokkos::ArithTraits::zero()) { + m_y(i) = Kokkos::ArithTraits::zero(); } else { m_y(i) = m_b(0) * m_y(i); } @@ -143,8 +133,7 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = -m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { m_y(i) = -m_x(i); } else { m_y(i) = -m_x(i) + m_b(0) * m_y(i); @@ -162,8 +151,7 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { m_y(i) = m_x(i); } else { m_y(i) = m_x(i) + m_b(0) * m_y(i); @@ -181,8 +169,7 @@ struct Axpby_Functor { } else if constexpr (scalar_y == 1) { m_y(i) = m_a(0) * m_x(i) + m_y(i); } else if constexpr (scalar_y == 2) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { m_y(i) = m_a(0) * m_x(i); } else { m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); @@ -209,8 +196,7 @@ struct Axpby_Functor { // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. template -struct Axpby_Functor { typedef typename YV::execution_space execution_space; typedef SizeType size_type; @@ -221,10 +207,8 @@ struct Axpby_Functor::value, "KokkosBlas::Impl::Axpby_Functor(ABscalars)" @@ -232,8 +216,7 @@ struct Axpby_Functor::value, "KokkosBlas::Impl::Axpby_Functor(ABscalars)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_Functor(ABscalars)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -243,8 +226,7 @@ struct Axpby_Functor -void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, - const BV& bv, const YV& y, const SizeType startingColumn, - int scalar_x = 2, int scalar_y = 2) { +template +void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, const BV& bv, const YV& y, + const SizeType startingColumn, int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_Generic: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -344,8 +323,7 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, "KokkosBlas::Impl::Axpby_Generic: " "XV and YV must have rank 1."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -361,20 +339,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** if (scalar_x == 0) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S0", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S1", policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S2", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S3", policy, op); } } @@ -383,20 +357,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** else if (scalar_x == -1) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S4", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S5", policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S6", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S7", policy, op); } } @@ -405,20 +375,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** else if (scalar_x == 1) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S8", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S9", policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S10", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S11", policy, op); } } @@ -427,20 +393,16 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, // **************************************************************** else if (scalar_x == 2) { if (scalar_y == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S12", policy, op); } else if (scalar_y == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S13", policy, op); } else if (scalar_y == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S14", policy, op); } else if (scalar_y == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); + Axpby_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::S15", policy, op); } } diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 7db7b0abe3..81c05fe7df 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -66,8 +66,7 @@ struct Axpby_MV_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -83,8 +82,7 @@ struct Axpby_MV_Functor { static_assert(BV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": BV must have rank 1."); - static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2), + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2), "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" ": scalar_x and/or scalar_y are out of range."); } @@ -123,8 +121,7 @@ struct Axpby_MV_Functor { // Nothing to do: Y(i,j) := Y(i,j) } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -132,8 +129,7 @@ struct Axpby_MV_Functor { #pragma vector always #endif for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = Kokkos::ArithTraits< - typename YMV::non_const_value_type>::zero(); + m_y(i, k) = Kokkos::ArithTraits::zero(); } } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP @@ -195,8 +191,7 @@ struct Axpby_MV_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -266,8 +261,7 @@ struct Axpby_MV_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -374,8 +368,7 @@ struct Axpby_MV_Functor { } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -409,8 +402,7 @@ struct Axpby_MV_Functor { } } else { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif @@ -467,8 +459,7 @@ struct Axpby_MV_Functor { // This version works by partial specialization on AV and BV. // In this partial specialization, both AV and BV are scalars. template -struct Axpby_MV_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -479,8 +470,7 @@ struct Axpby_MV_Functor::value, @@ -489,8 +479,7 @@ struct Axpby_MV_Functor::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -500,8 +489,7 @@ struct Axpby_MV_Functor +template struct Axpby_MV_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -704,8 +691,7 @@ struct Axpby_MV_Unroll_Functor { AV m_a; BV m_b; - Axpby_MV_Unroll_Functor(const XMV& x, const YMV& y, const AV& av, - const BV& bv, const SizeType startingColumn) + Axpby_MV_Unroll_Functor(const XMV& x, const YMV& y, const AV& av, const BV& bv, const SizeType startingColumn) : m_x(x), m_y(y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" @@ -719,8 +705,7 @@ struct Axpby_MV_Unroll_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -736,19 +721,16 @@ struct Axpby_MV_Unroll_Functor { static_assert(BV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": BV must have rank 1."); - static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2), + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2), "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { if (axpbyVarExtent(m_a) > 1) { - m_a = Kokkos::subview( - av, std::make_pair(startingColumn, SizeType(av.extent(0)))); + m_a = Kokkos::subview(av, std::make_pair(startingColumn, SizeType(av.extent(0)))); } if (axpbyVarExtent(m_b) > 1) { - m_b = Kokkos::subview( - bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + m_b = Kokkos::subview(bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); } } } @@ -781,14 +763,12 @@ struct Axpby_MV_Unroll_Functor { // Nothing to do: Y(i,j) := Y(i,j) } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = Kokkos::ArithTraits< - typename YMV::non_const_value_type>::zero(); + m_y(i, k) = Kokkos::ArithTraits::zero(); } } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -835,8 +815,7 @@ struct Axpby_MV_Unroll_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -888,8 +867,7 @@ struct Axpby_MV_Unroll_Functor { } } else if constexpr (scalar_y == 2) { if (m_b.extent(0) == 1) { - if (m_b(0) == - Kokkos::ArithTraits::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -969,8 +947,7 @@ struct Axpby_MV_Unroll_Functor { } else if constexpr (scalar_y == 2) { if (m_a.extent(0) == 1) { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -995,8 +972,7 @@ struct Axpby_MV_Unroll_Functor { } } else { if (m_b.extent(0) == 1) { - if (m_b(0) == Kokkos::ArithTraits< - typename BV::non_const_value_type>::zero()) { + if (m_b(0) == Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif @@ -1028,10 +1004,8 @@ struct Axpby_MV_Unroll_Functor { // Variant of Axpby_MV_Unroll_Functor for single coefficients (rather // than vectors of coefficients) a and b. The number of columns in X // and Y, UNROLL, is a compile-time constant. -template -struct Axpby_MV_Unroll_Functor +struct Axpby_MV_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -1041,10 +1015,8 @@ struct Axpby_MV_Unroll_Functor::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" @@ -1052,8 +1024,7 @@ struct Axpby_MV_Unroll_Functor::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1063,8 +1034,7 @@ struct Axpby_MV_Unroll_Functor -void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, - const SizeType startingColumn, int scalar_x = 2, - int scalar_y = 2) { +template +void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + const SizeType startingColumn, int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unrolled()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Unrolled()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Unrolled()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1251,8 +1217,7 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Unrolled()" ": XMV and YMV must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1268,20 +1233,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** if (scalar_x == 0) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S0", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S1", policy, op); } else if (scalar_y == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S2", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S3", policy, op); } } @@ -1290,20 +1251,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** else if (scalar_x == -1) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S4", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S5", policy, op); } else if (scalar_y == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S6", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S7", policy, op); } } @@ -1312,20 +1269,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** else if (scalar_x == 1) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S8", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S9", policy, op); } else if (scalar_y == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S10", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S11", policy, op); } } @@ -1334,20 +1287,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // **************************************************************** else if (scalar_x == 2) { if (scalar_y == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S12", policy, op); } else if (scalar_y == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S13", policy, op); } else if (scalar_y == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S14", policy, op); } else if (scalar_y == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); + Axpby_MV_Unroll_Functor op(x, y, av, bv, startingColumn); Kokkos::parallel_for("KokkosBlas::Axpby::MV::S15", policy, op); } } @@ -1372,19 +1321,16 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template -void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int scalar_x = 2, - int scalar_y = 2) { +template +void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Generic()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Generic()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Generic()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1394,8 +1340,7 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Generic()" ": XMV and YMV must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1499,20 +1444,17 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Left { - static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int scalar_x = 2, - int scalar_y = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1522,8 +1464,7 @@ struct Axpby_MV_Invoke_Left { static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" ": X and Y must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1544,8 +1485,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. - Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); + Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, scalar_x, + scalar_y); } for (; j + 4 <= numCols; j += 4) { XMV X_cur = Kokkos::subview(x, Kokkos::ALL(), std::make_pair(j, j + 4)); @@ -1554,8 +1495,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. - Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); + Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, scalar_x, + scalar_y); } for (; j < numCols; ++j) { auto x_cur = Kokkos::subview(x, Kokkos::ALL(), j); @@ -1566,8 +1507,7 @@ struct Axpby_MV_Invoke_Left { // the functor doesn't have to do anything to them. typedef decltype(x_cur) XV; typedef decltype(y_cur) YV; - Axpby_Generic( - space, av, x_cur, bv, y_cur, j, scalar_x, scalar_y); + Axpby_Generic(space, av, x_cur, bv, y_cur, j, scalar_x, scalar_y); } } }; @@ -1591,20 +1531,17 @@ struct Axpby_MV_Invoke_Left { // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Right { - static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int scalar_x = 2, - int scalar_y = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": Y must be nonconst, since it is an output argument" " and we have to be able to write to its entries."); @@ -1614,8 +1551,7 @@ struct Axpby_MV_Invoke_Right { static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" ": X and Y must have rank 2."); - if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && - (scalar_y <= 2)) { + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && (scalar_y <= 2)) { // Ok } else { KokkosKernels::Impl::throw_runtime_exception( @@ -1629,11 +1565,9 @@ struct Axpby_MV_Invoke_Right { auto y_0 = Kokkos::subview(y, Kokkos::ALL(), 0); typedef decltype(x_0) XV; typedef decltype(y_0) YV; - Axpby_Generic( - space, av, x_0, bv, y_0, 0, scalar_x, scalar_y); + Axpby_Generic(space, av, x_0, bv, y_0, 0, scalar_x, scalar_y); } else { - Axpby_MV_Generic( - space, av, x, bv, y, scalar_x, scalar_y); + Axpby_MV_Generic(space, av, x, bv, y, scalar_x, scalar_y); } } }; diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index 3aff21e0be..f4f85c8f6b 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -28,8 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_eti_spec_avail { enum : bool { value = false }; }; @@ -43,36 +42,29 @@ struct axpby_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_AXPBY_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -82,36 +74,29 @@ struct axpby_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct axpby_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -146,21 +131,16 @@ namespace Impl { /// Any scalar coefficient of zero has BLAS semantics of /// ignoring the corresponding (multi)vector entry. This does NOT /// apply to coefficients in av and bv vectors, if they are used. -template ::value, - bool eti_spec_avail = - axpby_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = axpby_eti_spec_avail::value> struct Axpby { - static void axpby(const execution_space& space, const AV& av, const XMV& X, - const BV& bv, const YMV& Y); + static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y); }; template struct Axpby { - static void axpby(const execution_space& /*space*/, const AV& /* av */, - const XMV& /* X */, const BV& /* bv */, + static void axpby(const execution_space& /*space*/, const AV& /* av */, const XMV& /* X */, const BV& /* bv */, const YMV& /* Y */) { static_assert(YMV::rank == 0, "Oh My God"); } @@ -175,20 +155,17 @@ struct Axpby { // the unification process forces AV = view and BV = view // ********************************************************************** template -struct Axpby { +struct Axpby { using size_type = typename YMV::size_type; - static void axpby(const execution_space& space, const AV& av, const XMV& X, - const BV& bv, const YMV& Y) { + static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby::axpby: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -198,21 +175,17 @@ struct Axpby::axpby: " "X and Y must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::axpby[ETI]" - : "KokkosBlas::axpby[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf( - "KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", typeid(AV).name(), + typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); else { printf( "KokkosBlas1::axpby<> non-ETI specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); } #endif @@ -255,22 +228,19 @@ struct Axpby(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - using index_type = int; - using Axpby_MV_Invoke_Layout = typename std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + using index_type = int; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } else { - using index_type = typename XMV::size_type; - using Axpby_MV_Invoke_Layout = typename std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); @@ -285,25 +255,22 @@ struct Axpby -struct Axpby { +struct Axpby { using AV = typename XMV::non_const_value_type; using BV = typename YMV::non_const_value_type; using size_type = typename YMV::size_type; using ATA = Kokkos::ArithTraits; using ATB = Kokkos::ArithTraits; - static void axpby(const execution_space& space, const AV& alpha, const XMV& X, - const BV& beta, const YMV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XMV& X, const BV& beta, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby::axpby (MV): " "X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby::axpby (MV): " "Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby::axpby (MV): Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -313,22 +280,18 @@ struct Axpby ETI specialization for < %s , %s , %s , %s >\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", typeid(AV).name(), + typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); else { printf( "KokkosBlas1::axpby<> non-ETI specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); } #endif @@ -353,22 +316,19 @@ struct Axpby(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - using index_type = int; - using Axpby_MV_Invoke_Layout = typename std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + using index_type = int; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } else { - using index_type = typename XMV::size_type; - using Axpby_MV_Invoke_Layout = typename std::conditional< - std::is_same::value, - Axpby_MV_Invoke_Left, - Axpby_MV_Invoke_Right >::type; + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = + typename std::conditional::value, + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); @@ -383,15 +343,12 @@ struct Axpby -struct Axpby { +struct Axpby { using size_type = typename YV::size_type; - static void axpby(const execution_space& space, const AV& av, const XV& X, - const BV& bv, const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::axpby[ETI]" - : "KokkosBlas::axpby[noETI]"); + static void axpby(const execution_space& space, const AV& av, const XV& X, const BV& bv, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); size_type const numRows = X.extent(0); @@ -433,12 +390,10 @@ struct Axpby(INT_MAX)) { using index_type = int; - Axpby_Generic( - space, av, X, bv, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, av, X, bv, Y, 0, scalar_x, scalar_y); } else { using index_type = typename XV::size_type; - Axpby_Generic( - space, av, X, bv, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, av, X, bv, Y, 0, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); @@ -453,8 +408,7 @@ struct Axpby -struct Axpby { using AV = typename XV::non_const_value_type; using BV = typename YV::non_const_value_type; @@ -462,16 +416,14 @@ struct Axpby; using ATB = Kokkos::ArithTraits; - static void axpby(const execution_space& space, const AV& alpha, const XV& X, - const BV& beta, const YV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: Y is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Axpby::axpby: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -482,21 +434,17 @@ struct Axpby::axpby: " "X and Y must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::axpby[ETI]" - : "KokkosBlas::axpby[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf( - "KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", - typeid(AV).name(), typeid(XV).name(), typeid(BV).name(), - typeid(YV).name()); + printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n", typeid(AV).name(), + typeid(XV).name(), typeid(BV).name(), typeid(YV).name()); else { printf( "KokkosBlas1::axpby<> non-ETI specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XV).name(), typeid(BV).name(), - typeid(YV).name()); + typeid(AV).name(), typeid(XV).name(), typeid(BV).name(), typeid(YV).name()); } #endif @@ -522,14 +470,12 @@ struct Axpby(INT_MAX)) { using index_type = int; - Axpby_Generic( - space, alpha, X, beta, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } else { using index_type = typename XV::size_type; - Axpby_Generic( - space, alpha, X, beta, Y, 0, scalar_x, scalar_y); + Axpby_Generic(space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } @@ -548,54 +494,42 @@ struct Axpby, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - extern template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1, false, true>; \ + extern template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; -#define KOKKOSBLAS1_AXPBY_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Axpby< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1, false, true>; \ + template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -606,56 +540,42 @@ struct Axpby, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - extern template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2, false, true>; \ + extern template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Axpby< \ - EXEC_SPACE, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - template struct Axpby< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Axpby< \ + EXEC_SPACE, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2, false, true>; \ + template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 9d200e892d..0a03007801 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -53,8 +53,7 @@ constexpr typename std::enable_if, bool>::type Tr1s_val() { } template -constexpr typename std::enable_if, bool>::type -Tr1s_val() { +constexpr typename std::enable_if, bool>::type Tr1s_val() { return false; } @@ -66,8 +65,7 @@ constexpr typename std::enable_if, bool>::type Tr1d_val() { } template -constexpr typename std::enable_if, bool>::type -Tr1d_val() { +constexpr typename std::enable_if, bool>::type Tr1d_val() { return false; } @@ -105,8 +103,7 @@ struct AxpbyUnificationAttemptTraits { // - type names begin with upper case letters // ******************************************************************** public: - static constexpr bool onDevice = - KokkosKernels::Impl::kk_is_gpu_exec_space(); + static constexpr bool onDevice = KokkosKernels::Impl::kk_is_gpu_exec_space(); private: static constexpr bool onHost = !onDevice; @@ -139,23 +136,15 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'AtInputScalarTypeA_nonConst' // ******************************************************************** - using ScalarTypeA2_onDevice = - typename getScalarTypeFromView::type; - using ScalarTypeA1_onDevice = - std::conditional_t; + using ScalarTypeA2_onDevice = typename getScalarTypeFromView::type; + using ScalarTypeA1_onDevice = std::conditional_t; - using ScalarTypeA2_onHost = - typename getScalarTypeFromView::type; - using ScalarTypeA1_onHost = - std::conditional_t; + using ScalarTypeA2_onHost = typename getScalarTypeFromView::type; + using ScalarTypeA1_onHost = std::conditional_t; - using AtInputScalarTypeA = - std::conditional_t; + using AtInputScalarTypeA = std::conditional_t; - using AtInputScalarTypeA_nonConst = - typename std::remove_const::type; + using AtInputScalarTypeA_nonConst = typename std::remove_const::type; // ******************************************************************** // Declare 'AtInputScalarTypeX_nonConst' @@ -167,23 +156,15 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'AtInputScalarTypeB_nonConst' // ******************************************************************** - using ScalarTypeB2_onDevice = - typename getScalarTypeFromView::type; - using ScalarTypeB1_onDevice = - std::conditional_t; + using ScalarTypeB2_onDevice = typename getScalarTypeFromView::type; + using ScalarTypeB1_onDevice = std::conditional_t; - using ScalarTypeB2_onHost = - typename getScalarTypeFromView::type; - using ScalarTypeB1_onHost = - std::conditional_t; + using ScalarTypeB2_onHost = typename getScalarTypeFromView::type; + using ScalarTypeB1_onHost = std::conditional_t; - using AtInputScalarTypeB = - std::conditional_t; + using AtInputScalarTypeB = std::conditional_t; - using AtInputScalarTypeB_nonConst = - typename std::remove_const::type; + using AtInputScalarTypeB_nonConst = typename std::remove_const::type; // ******************************************************************** // Declare 'AtInputScalarTypeY_nonConst' @@ -195,138 +176,115 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Declare 'InternalLayoutX' and 'InternalLayoutY' // ******************************************************************** - using InternalLayoutX = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using InternalLayoutY = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - YMV, InternalLayoutX>::array_layout; + using InternalLayoutX = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using InternalLayoutY = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // ******************************************************************** // Declare 'InternalTypeA_tmp' // ******************************************************************** - using AtInputLayoutA = - typename getLayoutFromView::type; + using AtInputLayoutA = typename getLayoutFromView::type; public: - static constexpr bool atInputLayoutA_isStride = - std::is_same_v; + static constexpr bool atInputLayoutA_isStride = std::is_same_v; private: using InternalLayoutA = - std::conditional_t<(a_is_r1d || a_is_r1s) && atInputLayoutA_isStride, - AtInputLayoutA, InternalLayoutX>; - - static constexpr bool atInputScalarTypeA_mustRemain = - Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex; - - using InternalScalarTypeA = std::conditional_t< - atInputScalarTypeA_mustRemain || ((a_is_r1d || a_is_r1s) && xyRank2Case), - AtInputScalarTypeA_nonConst // Yes, keep the input scalar type - , - AtInputScalarTypeX_nonConst // Yes, instead of - // 'AtInputScalarTypeA_nonConst' - >; - - using InternalTypeA_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Keep 'a' as scalar - InternalScalarTypeA, - Kokkos::View>>; - - using InternalTypeA_onHost = std::conditional_t< - (a_is_r1d || a_is_r1s) && xyRank2Case && onHost, - Kokkos::View>, - InternalScalarTypeA>; - - using InternalTypeA_tmp = - std::conditional_t; + std::conditional_t<(a_is_r1d || a_is_r1s) && atInputLayoutA_isStride, AtInputLayoutA, InternalLayoutX>; + + static constexpr bool atInputScalarTypeA_mustRemain = Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; + + using InternalScalarTypeA = + std::conditional_t; + + using InternalTypeA_onDevice = + std::conditional_t>>; + + using InternalTypeA_onHost = + std::conditional_t<(a_is_r1d || a_is_r1s) && xyRank2Case && onHost, + Kokkos::View>, + InternalScalarTypeA>; + + using InternalTypeA_tmp = std::conditional_t; // ******************************************************************** // Declare 'InternalTypeX' // ******************************************************************** public: - using InternalTypeX = std::conditional_t< - x_is_r2, - Kokkos::View>, - Kokkos::View>>; + using InternalTypeX = + std::conditional_t>, + Kokkos::View>>; // ******************************************************************** // Declare 'InternalTypeB_tmp' // ******************************************************************** private: - using AtInputLayoutB = - typename getLayoutFromView::type; + using AtInputLayoutB = typename getLayoutFromView::type; public: - static constexpr bool atInputLayoutB_isStride = - std::is_same_v; + static constexpr bool atInputLayoutB_isStride = std::is_same_v; private: using InternalLayoutB = - std::conditional_t<(b_is_r1d || b_is_r1s) && atInputLayoutB_isStride, - AtInputLayoutB, InternalLayoutY>; - - static constexpr bool atInputScalarTypeB_mustRemain = - Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex; - - using InternalScalarTypeB = std::conditional_t< - atInputScalarTypeB_mustRemain || ((b_is_r1d || b_is_r1s) && xyRank2Case), - AtInputScalarTypeB_nonConst // Yes, keep the input scalar type - , - AtInputScalarTypeY_nonConst // Yes, instead of - // 'AtInputScalarTypeB_nonConst' - >; - - using InternalTypeB_onDevice = std::conditional_t< - a_is_scalar && b_is_scalar && onDevice, // Keep 'b' as scalar - InternalScalarTypeB, - Kokkos::View>>; - - using InternalTypeB_onHost = std::conditional_t< - (b_is_r1d || b_is_r1s) && xyRank2Case && onHost, - Kokkos::View>, - InternalScalarTypeB>; - - using InternalTypeB_tmp = - std::conditional_t; + std::conditional_t<(b_is_r1d || b_is_r1s) && atInputLayoutB_isStride, AtInputLayoutB, InternalLayoutY>; + + static constexpr bool atInputScalarTypeB_mustRemain = Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; + + using InternalScalarTypeB = + std::conditional_t; + + using InternalTypeB_onDevice = + std::conditional_t>>; + + using InternalTypeB_onHost = + std::conditional_t<(b_is_r1d || b_is_r1s) && xyRank2Case && onHost, + Kokkos::View>, + InternalScalarTypeB>; + + using InternalTypeB_tmp = std::conditional_t; // ******************************************************************** // Declare 'InternalTypeY' // ******************************************************************** public: - using InternalTypeY = std::conditional_t< - y_is_r2, - Kokkos::View>, - Kokkos::View>>; + using InternalTypeY = + std::conditional_t>, + Kokkos::View>>; // ******************************************************************** // Declare 'InternalTypeA': if 'InternalTypeB_tmp' is a view then // make sure 'InternalTypeA' is a view as well // ******************************************************************** - using InternalTypeA = std::conditional_t< - !Kokkos::is_view_v && - Kokkos::is_view_v, - Kokkos::View>, - InternalTypeA_tmp>; + using InternalTypeA = + std::conditional_t && Kokkos::is_view_v, + Kokkos::View>, + InternalTypeA_tmp>; // ******************************************************************** // Declare 'InternalTypeA_managed' with the same scalar type in @@ -336,23 +294,19 @@ struct AxpbyUnificationAttemptTraits { using InternalLayoutA_managed = InternalLayoutA; public: - using InternalTypeA_managed = std::conditional_t< - Kokkos::is_view_v, - Kokkos::View, - void>; + using InternalTypeA_managed = + std::conditional_t, + Kokkos::View, void>; // ******************************************************************** // Declare 'InternalTypeB' if 'InternalTypeA_tmp' is a view then // make sure 'InternalTypeB' is a view as well // ******************************************************************** - using InternalTypeB = std::conditional_t< - Kokkos::is_view_v && - !Kokkos::is_view_v, - Kokkos::View>, - InternalTypeB_tmp>; + using InternalTypeB = + std::conditional_t && !Kokkos::is_view_v, + Kokkos::View>, + InternalTypeB_tmp>; // ******************************************************************** // Declare 'InternalTypeB_managed' with the same scalar type in @@ -362,91 +316,72 @@ struct AxpbyUnificationAttemptTraits { using InternalLayoutB_managed = InternalLayoutB; public: - using InternalTypeB_managed = std::conditional_t< - Kokkos::is_view_v, - Kokkos::View, - void>; + using InternalTypeB_managed = + std::conditional_t, + Kokkos::View, void>; // ******************************************************************** // Auxiliary Boolean results on internal types // ******************************************************************** private: - static constexpr bool internalTypeA_is_scalar = - !Kokkos::is_view_v; - static constexpr bool internalTypeA_is_r1d = Tr1d_val(); + static constexpr bool internalTypeA_is_scalar = !Kokkos::is_view_v; + static constexpr bool internalTypeA_is_r1d = Tr1d_val(); - static constexpr bool internalTypeB_is_scalar = - !Kokkos::is_view_v; - static constexpr bool internalTypeB_is_r1d = Tr1d_val(); + static constexpr bool internalTypeB_is_scalar = !Kokkos::is_view_v; + static constexpr bool internalTypeB_is_r1d = Tr1d_val(); public: - static constexpr bool internalTypesAB_bothScalars = - (internalTypeA_is_scalar && internalTypeB_is_scalar); - static constexpr bool internalTypesAB_bothViews = - (internalTypeA_is_r1d && internalTypeB_is_r1d); + static constexpr bool internalTypesAB_bothScalars = (internalTypeA_is_scalar && internalTypeB_is_scalar); + static constexpr bool internalTypesAB_bothViews = (internalTypeA_is_r1d && internalTypeB_is_r1d); // ******************************************************************** // Routine to perform checks (both compile time and run time) // ******************************************************************** - static void performChecks(const AV& a, const XMV& X, const BV& b, - const YMV& Y) { + static void performChecks(const AV& a, const XMV& X, const BV& b, const YMV& Y) { // ****************************************************************** // Check 1/6: General checks // ****************************************************************** - static_assert( - Kokkos::is_execution_space_v, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": tExecSpace must be a valid Kokkos execution space."); - - static_assert( - (xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": one must have either both X and Y as rank 1, or both X and Y as " - "rank 2"); - - if constexpr (!Kokkos::ArithTraits< - AtInputScalarTypeY_nonConst>::is_complex) { - static_assert( - (!Kokkos::ArithTraits::is_complex) && - (!Kokkos::ArithTraits::is_complex) && - (!Kokkos::ArithTraits::is_complex), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": if Y is not complex, then A, X and B cannot be complex"); + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": tExecSpace must be a valid Kokkos execution space."); + + static_assert((xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": one must have either both X and Y as rank 1, or both X and Y as " + "rank 2"); + + if constexpr (!Kokkos::ArithTraits::is_complex) { + static_assert((!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": if Y is not complex, then A, X and B cannot be complex"); } // ****************************************************************** // Check 2/6: YMV is valid // ****************************************************************** - static_assert( - Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": Y is not a Kokkos::View."); - static_assert( - std::is_same::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": Y is const. It must be nonconst, " - "because it is an output argument " - "(we must be able to write to its entries)."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": XMV must be accessible from tExecSpace"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is not a Kokkos::View."); + static_assert(std::is_same::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is const. It must be nonconst, " + "because it is an output argument " + "(we must be able to write to its entries)."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": XMV must be accessible from tExecSpace"); // ****************************************************************** // Check 3/6: XMV is valid // ****************************************************************** - static_assert( - Kokkos::is_view::value, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ": XMV must be accessible from tExecSpace"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": XMV must be accessible from tExecSpace"); if constexpr (xyRank1Case) { if (X.extent(0) != Y.extent(0)) { @@ -454,8 +389,7 @@ struct AxpbyUnificationAttemptTraits { msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" ")" << ", invalid rank-1 X extent" - << ": X.extent(0) = " << X.extent(0) - << ", Y.extent(0) = " << Y.extent(0); + << ": X.extent(0) = " << X.extent(0) << ", Y.extent(0) = " << Y.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { @@ -464,10 +398,8 @@ struct AxpbyUnificationAttemptTraits { msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" ")" << ", invalid rank-2 X extents" - << ": X.extent(0) = " << X.extent(0) - << ", X.extent(1) = " << X.extent(1) - << ", Y.extent(0) = " << Y.extent(0) - << ", Y.extent(1) = " << Y.extent(1); + << ": X.extent(0) = " << X.extent(0) << ", X.extent(1) = " << X.extent(1) + << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } @@ -476,10 +408,8 @@ struct AxpbyUnificationAttemptTraits { // Check 4/6: AV is valid // ****************************************************************** static_assert( - (a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || - (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || - (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || - (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), + (a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": 'a' must be either scalar or rank 0 or rank 1 static or rank 1 " "dynamic"); @@ -495,8 +425,7 @@ struct AxpbyUnificationAttemptTraits { KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } else { - if ((a.extent(0) == 1) || - (a.extent(0) == Y.extent(1))) { // Yes, 'Y' is the reference + if ((a.extent(0) == 1) || (a.extent(0) == Y.extent(1))) { // Yes, 'Y' is the reference // Ok } else { std::ostringstream msg; @@ -504,8 +433,7 @@ struct AxpbyUnificationAttemptTraits { "performChecks()" << ": view 'a' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" - << ", a.extent(0) = " << a.extent(0) - << ", Y.extent(0) = " << Y.extent(0) + << ", a.extent(0) = " << a.extent(0) << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } @@ -516,10 +444,8 @@ struct AxpbyUnificationAttemptTraits { // Check 5/6: BV is valid // ****************************************************************** static_assert( - (b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || - (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || - (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || - (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), + (b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" ": 'b' must be either scalar or rank 0 or rank 1 static or rank 1 " "dynamic"); @@ -543,8 +469,7 @@ struct AxpbyUnificationAttemptTraits { "performChecks()" << ": view 'b' must have extent(0) == 1 or Y.extent(1) for " "xyRank2Case" - << ", b.extent(0) = " << b.extent(0) - << ", Y.extent(0) = " << Y.extent(0) + << ", b.extent(0) = " << b.extent(0) << ", Y.extent(0) = " << Y.extent(0) << ", Y.extent(1) = " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } @@ -556,147 +481,115 @@ struct AxpbyUnificationAttemptTraits { // ****************************************************************** if constexpr (onHost) { if constexpr (xyRank1Case) { - constexpr bool internalTypeA_isOk = - (internalTypeA_is_scalar || internalTypeA_is_r1d); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeX is wrong"); - - constexpr bool internalTypeB_isOk = - (internalTypeB_is_scalar || internalTypeB_is_r1d); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank1Case: InternalTypeY is wrong"); + constexpr bool internalTypeA_isOk = (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeY is wrong"); } else { - constexpr bool internalTypeA_isOk = - (internalTypeA_is_scalar || internalTypeA_is_r1d); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeX is wrong"); - - constexpr bool internalTypeB_isOk = - (internalTypeB_is_scalar || internalTypeB_is_r1d); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, xyRank2Case: InternalTypeY is wrong"); + constexpr bool internalTypeA_isOk = (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeY is wrong"); } } else { if constexpr (xyRank1Case) { constexpr bool internalTypeA_isOk = - internalTypeA_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeX is wrong"); + internalTypeA_is_r1d || (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = - internalTypeB_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank1Case: InternalTypeY is wrong"); + internalTypeB_is_r1d || (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeY is wrong"); } else { constexpr bool internalTypeA_isOk = - internalTypeA_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); - static_assert( - internalTypeA_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeA is wrong"); - - constexpr bool internalTypeX_isOk = std::is_same_v< - InternalTypeX, - Kokkos::View>>; - static_assert( - internalTypeX_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeX is wrong"); + internalTypeA_is_r1d || (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert(internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = + std::is_same_v>>; + static_assert(internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeX is wrong"); constexpr bool internalTypeB_isOk = - internalTypeB_is_r1d || - (a_is_scalar && b_is_scalar && internalTypeB_is_scalar); - static_assert( - internalTypeB_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeB is wrong"); - - constexpr bool internalTypeY_isOk = std::is_same_v< - InternalTypeY, - Kokkos::View>>; - static_assert( - internalTypeY_isOk, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, xyRank2Case: InternalTypeY is wrong"); + internalTypeB_is_r1d || (a_is_scalar && b_is_scalar && internalTypeB_is_scalar); + static_assert(internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = + std::is_same_v>>; + static_assert(internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeY is wrong"); } } @@ -714,10 +607,9 @@ struct AxpbyUnificationAttemptTraits { // - [InternalTypeA, B] = [S_a, S_b], or // - [InternalTypeA, B] = [view, view] // **************************************************************** - static_assert( - internalTypesAB_bothScalars || internalTypesAB_bothViews, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onHost, invalid combination of types"); + static_assert(internalTypesAB_bothScalars || internalTypesAB_bothViews, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, invalid combination of types"); } // If onHost else if constexpr (onDevice) { // **************************************************************** @@ -733,35 +625,25 @@ struct AxpbyUnificationAttemptTraits { // - [InternalTypeA, B] = [S_a, S_b], or // - [InternalTypeA, B] = [view, view] // **************************************************************** - static_assert( - internalTypesAB_bothViews || - (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", onDevice, invalid combination of types"); + static_assert(internalTypesAB_bothViews || (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, invalid combination of types"); } - if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && - atInputLayoutA_isStride) { - static_assert( - std::is_same_v< - typename getLayoutFromView< - InternalTypeA, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" - ", but no LayoutStride internally"); + if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && atInputLayoutA_isStride) { + static_assert(std::is_same_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } - if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && - atInputLayoutB_isStride) { - static_assert( - std::is_same_v< - typename getLayoutFromView< - InternalTypeB, Kokkos::is_view_v>::type, - Kokkos::LayoutStride>, - "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" - ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" - ", but no LayoutStride internally"); + if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && atInputLayoutB_isStride) { + static_assert(std::is_same_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); } } // Constructor @@ -776,28 +658,20 @@ struct AxpbyUnificationAttemptTraits { //<< ", AV::non_const_data_type = " << // typeid(AV::non_const_data_type).name() << ", AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() - << ", isConst = " - << std::is_const_v << ", isComplex = " + << ", isConst = " << std::is_const_v << ", isComplex = " << Kokkos::ArithTraits::is_complex - << ", AtInputScalarTypeA_nonConst = " - << typeid(AtInputScalarTypeA_nonConst).name() + << ", AtInputScalarTypeA_nonConst = " << typeid(AtInputScalarTypeA_nonConst).name() << ", InternalTypeA = " << typeid(InternalTypeA).name() << "\n" - << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() - << "\n" + << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() << "\n" << "\n" << "XMV = " << typeid(XMV).name() << "\n" - << "XMV::value_type = " << typeid(typename XMV::value_type).name() - << "\n" - << "XMV::const_data_type = " - << typeid(typename XMV::const_data_type).name() << "\n" - << "XMV::non_const_data_type = " - << typeid(typename XMV::non_const_data_type).name() << "\n" + << "XMV::value_type = " << typeid(typename XMV::value_type).name() << "\n" + << "XMV::const_data_type = " << typeid(typename XMV::const_data_type).name() << "\n" + << "XMV::non_const_data_type = " << typeid(typename XMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " - << Kokkos::ArithTraits::is_complex << "\n" - << "AtInputScalarTypeX_nonConst = " - << typeid(AtInputScalarTypeX_nonConst).name() << "\n" + << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" + << "AtInputScalarTypeX_nonConst = " << typeid(AtInputScalarTypeX_nonConst).name() << "\n" << "InternalTypeX = " << typeid(InternalTypeX).name() << "\n" << "\n" << "BV = " @@ -806,28 +680,20 @@ struct AxpbyUnificationAttemptTraits { //<< ", BV::non_const_data_type = " << // typeid(BV::non_const_data_type).name() << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() - << ", isConst = " - << std::is_const_v << ", isComplex = " + << ", isConst = " << std::is_const_v << ", isComplex = " << Kokkos::ArithTraits::is_complex - << ", AtInputScalarTypeB_nonConst = " - << typeid(AtInputScalarTypeB_nonConst).name() + << ", AtInputScalarTypeB_nonConst = " << typeid(AtInputScalarTypeB_nonConst).name() << ", InternalTypeB = " << typeid(InternalTypeB).name() << "\n" - << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() - << "\n" + << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() << "\n" << "\n" << "YMV = " << typeid(YMV).name() << "\n" - << "YMV::value_type = " << typeid(typename YMV::value_type).name() - << "\n" - << "YMV::const_data_type = " - << typeid(typename YMV::const_data_type).name() << "\n" - << "YMV::non_const_data_type = " - << typeid(typename YMV::non_const_data_type).name() << "\n" + << "YMV::value_type = " << typeid(typename YMV::value_type).name() << "\n" + << "YMV::const_data_type = " << typeid(typename YMV::const_data_type).name() << "\n" + << "YMV::non_const_data_type = " << typeid(typename YMV::non_const_data_type).name() << "\n" << "AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name() << "\n" << "isConst = " << std::is_const_v << "\n" - << "isComplex = " - << Kokkos::ArithTraits::is_complex << "\n" - << "AtInputScalarTypeY_nonConst = " - << typeid(AtInputScalarTypeY_nonConst).name() << "\n" + << "isComplex = " << Kokkos::ArithTraits::is_complex << "\n" + << "AtInputScalarTypeY_nonConst = " << typeid(AtInputScalarTypeY_nonConst).name() << "\n" << "InternalTypeY = " << typeid(InternalTypeY).name() << "\n" << std::endl; } @@ -840,8 +706,7 @@ struct AxpbyUnificationAttemptTraits { template struct getScalarValueFromVariableAtHost { getScalarValueFromVariableAtHost() { - static_assert((rankT == -1) || (rankT == 0) || (rankT == 1), - "Generic struct should not have been invoked!"); + static_assert((rankT == -1) || (rankT == 0) || (rankT == 1), "Generic struct should not have been invoked!"); } }; @@ -879,8 +744,7 @@ template size_t getStrideInCoefficient(T const& coeff) { size_t result = 1; if constexpr (Kokkos::is_view_v) { - if constexpr ((T::rank == 1) && (std::is_same_v)) { + if constexpr ((T::rank == 1) && (std::is_same_v)) { result = coeff.stride_0(); } } @@ -890,8 +754,7 @@ size_t getStrideInCoefficient(T const& coeff) { // -------------------------------- template -static void populateRank1Stride1ViewWithScalarOrNonStrideView( - T_in const& coeff_in, T_out& coeff_out) { +static void populateRank1Stride1ViewWithScalarOrNonStrideView(T_in const& coeff_in, T_out& coeff_out) { // *********************************************************************** // 'coeff_out' is assumed to be rank-1, of LayoutLeft or LayoutRight // @@ -899,8 +762,7 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView( // - a coeff_in that deals with 'double', and // - a coeff_out deals with 'complex' // *********************************************************************** - using ScalarOutType = - typename std::remove_const::type; + using ScalarOutType = typename std::remove_const::type; if constexpr (!Kokkos::is_view_v) { // ********************************************************************* @@ -924,17 +786,13 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView( std::ostringstream msg; msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" << ": 'in' and 'out' should have the same extent(0)" - << ", T_in = " << typeid(T_in).name() - << ", coeff_in.label() = " << coeff_in.label() - << ", coeff_in.extent(0) = " << coeff_in.extent(0) - << ", T_out = " << typeid(T_out).name() - << ", coeff_out.label() = " << coeff_out.label() - << ", coeff_out.extent(0) = " << coeff_out.extent(0); + << ", T_in = " << typeid(T_in).name() << ", coeff_in.label() = " << coeff_in.label() + << ", coeff_in.extent(0) = " << coeff_in.extent(0) << ", T_out = " << typeid(T_out).name() + << ", coeff_out.label() = " << coeff_out.label() << ", coeff_out.extent(0) = " << coeff_out.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } - using ScalarInType = - typename std::remove_const::type; + using ScalarInType = typename std::remove_const::type; if constexpr (std::is_same_v) { coeff_out = coeff_in; } else if (coeff_out.extent(0) == 1) { @@ -946,14 +804,10 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView( std::ostringstream msg; msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" << ": scalar types 'in' and 'out' should be the same" - << ", T_in = " << typeid(T_in).name() - << ", ScalarInType = " << typeid(ScalarInType).name() - << ", coeff_in.label() = " << coeff_in.label() - << ", coeff_in.extent(0) = " << coeff_in.extent(0) - << ", T_out = " << typeid(T_out).name() - << ", ScalarOutType = " << typeid(ScalarOutType).name() - << ", coeff_out.label() = " << coeff_out.label() - << ", coeff_out.extent(0) = " << coeff_out.extent(0); + << ", T_in = " << typeid(T_in).name() << ", ScalarInType = " << typeid(ScalarInType).name() + << ", coeff_in.label() = " << coeff_in.label() << ", coeff_in.extent(0) = " << coeff_in.extent(0) + << ", T_out = " << typeid(T_out).name() << ", ScalarOutType = " << typeid(ScalarOutType).name() + << ", coeff_out.label() = " << coeff_out.label() << ", coeff_out.extent(0) = " << coeff_out.extent(0); KokkosKernels::Impl::throw_runtime_exception(msg.str()); } } diff --git a/blas/impl/KokkosBlas1_dot_impl.hpp b/blas/impl/KokkosBlas1_dot_impl.hpp index 2003f7cc2c..61e7307bc8 100644 --- a/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_impl.hpp @@ -30,8 +30,7 @@ namespace Impl { /// \tparam YVector Type of the second vector y; 1-D View /// \tparam SizeType Type of the row index used in the dot product. /// For best performance, use int instead of size_t here. -template +template struct DotFunctor { typedef SizeType size_type; typedef typename AV::non_const_value_type avalue_type; @@ -44,26 +43,19 @@ struct DotFunctor { DotFunctor(const XVector& x, const YVector& y) : m_x(x), m_y(y) {} void run(const char* label, const execution_space& space, AV result) { - Kokkos::RangePolicy policy(space, 0, - m_x.extent(0)); + Kokkos::RangePolicy policy(space, 0, m_x.extent(0)); Kokkos::parallel_reduce(label, policy, *this, result); } // Prefer const size_type& to const size_type or size_type, // since the compiler has an easier time inlining the former. - KOKKOS_FORCEINLINE_FUNCTION void operator()(const size_type& i, - value_type& sum) const { + KOKKOS_FORCEINLINE_FUNCTION void operator()(const size_type& i, value_type& sum) const { Kokkos::Details::updateDot(sum, m_x(i), m_y(i)); // sum += m_x(i) * m_y(i) } - KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = Kokkos::ArithTraits::zero(); - } + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = Kokkos::ArithTraits::zero(); } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { - update += source; - } + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { update += source; } }; } // namespace Impl diff --git a/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/blas/impl/KokkosBlas1_dot_mv_impl.hpp index d19e512599..15db366ceb 100644 --- a/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -27,9 +27,8 @@ namespace Impl { template struct Dot_MV_Functor { - using Scalar = typename RV::non_const_value_type; - using IPT = Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>; + using Scalar = typename RV::non_const_value_type; + using IPT = Kokkos::Details::InnerProductSpaceTraits; using dot_type = typename IPT::dot_type; using KAT = Kokkos::ArithTraits; @@ -39,8 +38,7 @@ struct Dot_MV_Functor { XV x; YV y; - size_type - teamsPerDot; // number of teams collectively performing a dot product + size_type teamsPerDot; // number of teams collectively performing a dot product Dot_MV_Functor(const RV& r_, const XV& x_, const YV& y_, int teamsPerDot_) : r(r_), x(x_), y(y_), teamsPerDot(teamsPerDot_) {} @@ -60,13 +58,11 @@ struct Dot_MV_Functor { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(t, begin, end), [&](size_type k, dot_type& update) { - Kokkos::Details::updateDot(update, x.access(k, xcol), - y.access(k, ycol)); + Kokkos::Details::updateDot(update, x.access(k, xcol), y.access(k, ycol)); }, localResult); - Kokkos::single(Kokkos::PerTeam(t), - [&]() { Kokkos::atomic_add(&r(i), Scalar(localResult)); }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), Scalar(localResult)); }); } }; @@ -75,14 +71,12 @@ struct Dot_MV_Functor { template void MV_Dot_Invoke( const execution_space& space, const RV& r, const XV& x, const YV& y, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { size_type numDots = std::max(x.extent(1), y.extent(1)); if (x.extent(0) != y.extent(0)) { std::ostringstream oss; - oss << "KokkosBlas::dot (rank-2): x and y have different lengths (" - << x.extent(0) << " and " << y.extent(0) << ")"; + oss << "KokkosBlas::dot (rank-2): x and y have different lengths (" << x.extent(0) << " and " << y.extent(0) << ")"; throw std::runtime_error(oss.str()); } if ((x.extent(1) != size_t(1) && x.extent(1) != size_t(numDots)) || @@ -95,23 +89,17 @@ void MV_Dot_Invoke( } if (r.extent(0) != size_t(numDots)) { std::ostringstream oss; - oss << "KokkosBlas::dot (rank-2): result vector has wrong length (" - << r.extent(0) << ", but " << numDots + oss << "KokkosBlas::dot (rank-2): result vector has wrong length (" << r.extent(0) << ", but " << numDots << " dot products will be computed)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerDot; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), numDots, teamsPerDot); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), numDots, teamsPerDot); size_type numTeams = numDots * teamsPerDot; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for("Dot_MV", pol, - Dot_MV_Functor( - r, x, y, teamsPerDot)); + Kokkos::parallel_for("Dot_MV", pol, Dot_MV_Functor(r, x, y, teamsPerDot)); } // Version for when a temporary result view is needed (implemented in terms of @@ -119,15 +107,11 @@ void MV_Dot_Invoke( template void MV_Dot_Invoke( const execution_space& space, const RV& r, const XV& x, const YV& y, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), - r.extent(0)); - MV_Dot_Invoke( - space, tempResult, x, y); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), r.extent(0)); + MV_Dot_Invoke(space, tempResult, x, y); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 02efee6bc5..982e2eaa0c 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -54,15 +54,11 @@ struct DotAccumulatingScalar> { template struct HasSpecialAccumulator { - enum : bool { - value = !std::is_same::type>::value - }; + enum : bool { value = !std::is_same::type>::value }; }; // Specialization struct which defines whether a specialization exists -template +template struct dot_eti_spec_avail { enum : bool { value = false }; }; @@ -75,34 +71,27 @@ struct dot_eti_spec_avail { // the declarations of full specializations go in this header file. // We may spread out definitions (see _INST macro below) across one or // more .cpp files. -#define KOKKOSBLAS1_DOT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct dot_eti_spec_avail>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1> { \ + enum : bool { value = true }; \ }; // @@ -112,55 +101,42 @@ struct dot_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct dot_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct dot_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -172,36 +148,28 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = - dot_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = dot_eti_spec_avail::value> struct Dot { - static void dot(const execution_space& space, const RV&, const XV& R, - const YV& X); + static void dot(const execution_space& space, const RV&, const XV& R, const YV& X); }; // This version never has TPL support, but it does use the same ETI system template ::value> + bool eti_spec_avail = dot_eti_spec_avail::value> struct DotSpecialAccumulator { // Note: not doing the static_asserts to validate RV, XV, YV since those // errors would have already arisen when building the library. - using size_type = typename YV::size_type; - using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::dot_type; + using size_type = typename YV::size_type; + using dot_type = typename Kokkos::Details::InnerProductSpaceTraits::dot_type; using accum_type = typename DotAccumulatingScalar::type; // This is the same View type as RV, but using the special accumulator as the // value type - using RV_Result = Kokkos::View>; - static void dot(const execution_space& space, const RV_Result& R, const XV& X, - const YV& Y); + static void dot(const execution_space& space, const RV_Result& R, const XV& X, const YV& Y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -209,8 +177,7 @@ struct DotSpecialAccumulator { // The rank-1 case is currently the only one that may use a different // accumulator type than InnerProductSpaceTraits::dot_type. template -struct Dot { +struct Dot { // Check some things about the template parameters at compile time to get nice // error messages, before using them under the assumption they are valid. static_assert(Kokkos::is_view::value, @@ -231,8 +198,7 @@ struct Dot: " "YV is not rank 1."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Dot<1D>: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -243,23 +209,18 @@ struct Dot> RV_Result; - static void dot(const execution_space& space, const RV& R, const XV& X, - const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::dot[ETI]" - : "KokkosBlas::dot[noETI]"); + static void dot(const execution_space& space, const RV& R, const XV& X, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" + : "KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); else { - printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); } #endif const size_type numElems = X.extent(0); @@ -282,8 +243,7 @@ struct Dot -struct DotSpecialAccumulator { +struct DotSpecialAccumulator { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: XV is not a Kokkos::View."); @@ -299,38 +259,30 @@ struct DotSpecialAccumulator::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: RV is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::DotSpecialAccumulator: X and Y have " "different scalar types."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Dot<1D>: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - using size_type = typename YV::size_type; - using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::dot_type; + using size_type = typename YV::size_type; + using dot_type = typename Kokkos::Details::InnerProductSpaceTraits::dot_type; using accum_type = typename DotAccumulatingScalar::type; // This is the same View type as RV, but using the special accumulator as the // value type - using RV_Result = Kokkos::View>; - static void dot(const execution_space& space, const RV_Result& R, const XV& X, - const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::dot[ETI]" - : "KokkosBlas::dot[noETI]"); + static void dot(const execution_space& space, const RV_Result& R, const XV& X, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" + : "KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); else { - printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", - typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas::dot<> non-ETI specialization for < %s , %s >\n", typeid(XV).name(), typeid(YV).name()); } #endif const size_type numElems = X.extent(0); @@ -348,10 +300,8 @@ struct DotSpecialAccumulator -struct Dot { +template +struct Dot { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Dot<2-D>: XV is not a Kokkos::View."); @@ -367,29 +317,25 @@ struct Dot - static auto getFirstColumn( - const V& v, typename std::enable_if::type* = nullptr) { + static auto getFirstColumn(const V& v, typename std::enable_if::type* = nullptr) { return Kokkos::subview(v, Kokkos::ALL(), 0); } template - static V getFirstColumn( - const V& v, typename std::enable_if::type* = nullptr) { + static V getFirstColumn(const V& v, typename std::enable_if::type* = nullptr) { return v; } - static void dot(const execution_space& space, const RV& R, const XV& X, - const YV& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::dot[ETI]" - : "KokkosBlas::dot[noETI]"); + static void dot(const execution_space& space, const RV& R, const XV& X, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" + : "KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::dot<> ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas1::dot<> ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(XV).name(), + typeid(YV).name()); else { - printf("KokkosBlas1::dot<> non-ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas1::dot<> non-ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(XV).name(), + typeid(YV).name()); } #endif @@ -401,20 +347,15 @@ struct Dot(INT_MAX)) { typedef int index_type; - DotFunctor - f(X0, Y0); + DotFunctor f(X0, Y0); f.run("KokkosBlas::dot<1D>", space, R0); } else { typedef int64_t index_type; - DotFunctor - f(X0, Y0); + DotFunctor f(X0, Y0); f.run("KokkosBlas::dot<1D>", space, R0); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numDots < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numDots < static_cast(INT_MAX)) { typedef int index_type; MV_Dot_Invoke(space, R, X, Y); } else { @@ -437,95 +378,68 @@ struct Dot>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - extern template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - extern template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true>; \ - extern template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Dot>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true>; \ + extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, Kokkos::View>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ true>; -#define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Dot>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, 1, false, true>; \ - template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true>; \ - template struct DotSpecialAccumulator< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Dot>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, 1, false, true>; \ + template struct DotSpecialAccumulator< \ + EXEC_SPACE, Kokkos::View>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true>; \ + template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ true>; // @@ -534,88 +448,62 @@ struct Dot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 2, false, true>; \ - extern template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 1, false, true>; \ - extern template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 2, false, true>; \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 1, false, true>; \ + extern template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 1, 2, false, true>; -#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 2, false, true>; \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 2, 1, false, true>; \ - template struct Dot< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 2, false, true>; \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 2, 1, false, true>; \ + template struct Dot< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 1, 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_iamax_impl.hpp b/blas/impl/KokkosBlas1_iamax_impl.hpp index 4c7a3fcc0c..bef00fad8c 100644 --- a/blas/impl/KokkosBlas1_iamax_impl.hpp +++ b/blas/impl/KokkosBlas1_iamax_impl.hpp @@ -29,8 +29,7 @@ namespace Impl { /// \tparam XV 1-D input View /// \tparam MagType Magnitude type /// \tparam SizeType Index type. Use int (32 bits) if possible. -template +template struct V_Iamax_Functor { using size_type = SizeType; using mag_type = MagType; @@ -47,8 +46,7 @@ struct V_Iamax_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Iamax_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Iamax_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -57,8 +55,7 @@ struct V_Iamax_Functor { "RV must have rank 0 and XV must have rank 1."); } - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, - value_type& lmaxloc) const { + KOKKOS_INLINE_FUNCTION void operator()(const size_type i, value_type& lmaxloc) const { mag_type val = IPT::norm(m_x(i - 1)); mag_type maxval = IPT::norm(m_x(lmaxloc - 1)); if (val > maxval) lmaxloc = i; @@ -68,8 +65,7 @@ struct V_Iamax_Functor { update = Kokkos::reduction_identity::max() + 1; } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { mag_type source_val = IPT::norm(m_x(source - 1)); mag_type update_val = IPT::norm(m_x(update - 1)); if (update_val < source_val) update = source; @@ -107,8 +103,7 @@ void MV_Iamax_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_Iamax_Invoke( - space, ri, Xi); + V_Iamax_Invoke(space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_iamax_spec.hpp b/blas/impl/KokkosBlas1_iamax_spec.hpp index 341b949050..80e4cb6036 100644 --- a/blas/impl/KokkosBlas1_iamax_spec.hpp +++ b/blas/impl/KokkosBlas1_iamax_spec.hpp @@ -43,39 +43,29 @@ struct iamax_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct iamax_eti_spec_avail< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_AVAIL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for declaration of full specialization availability @@ -84,39 +74,29 @@ struct iamax_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST( \ - INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct iamax_eti_spec_avail< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST( \ - unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_AVAIL_INDEX_HOST(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // Include the actual specialization declarations #include @@ -128,10 +108,8 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - iamax_eti_spec_avail::value> + bool tpl_spec_avail = iamax_tpl_spec_avail::value, + bool eti_spec_avail = iamax_eti_spec_avail::value> struct Iamax { static void iamax(const execution_space& space, const RMV& R, const XMV& X); }; @@ -139,8 +117,7 @@ struct Iamax { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Iamax for single vectors (1-D Views). template -struct Iamax { +struct Iamax { typedef typename XMV::size_type size_type; static void iamax(const execution_space& space, const RMV& R, const XMV& X) { @@ -156,16 +133,13 @@ struct Iamax: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::iamax[ETI]" - : "KokkosBlas::iamax[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::iamax[ETI]" + : "KokkosBlas::iamax[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -181,8 +155,7 @@ struct Iamax -struct Iamax { +struct Iamax { typedef typename XMV::size_type size_type; static void iamax(const execution_space& space, const RV& R, const XMV& X) { @@ -198,23 +171,19 @@ struct Iamax: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::iamax[ETI]" - : "KokkosBlas::iamax[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::iamax[ETI]" + : "KokkosBlas::iamax[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::iamax<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Iamax_Invoke(space, R, X); } else { typedef std::int64_t index_type; @@ -235,64 +204,46 @@ struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - extern template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Iamax< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + extern template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for definition of full specialization of // KokkosBlas::Impl::Iamax for rank == 1. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for declaration of full specialization of @@ -301,66 +252,46 @@ struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - extern template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Iamax< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + extern template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) // // Macro for definition of full specialization of // KokkosBlas::Impl::Iamax for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - template struct Iamax< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Iamax< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + template struct Iamax, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned long, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(unsigned int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(int, SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) #include diff --git a/blas/impl/KokkosBlas1_mult_impl.hpp b/blas/impl/KokkosBlas1_mult_impl.hpp index 048db395b0..3584240e70 100644 --- a/blas/impl/KokkosBlas1_mult_impl.hpp +++ b/blas/impl/KokkosBlas1_mult_impl.hpp @@ -34,8 +34,7 @@ namespace Impl { /// /// C(i,j) = c * C(i,j) + ab * A(i) * B(i,j), subject to the usual /// BLAS update rules. -template +template struct MV_MultFunctor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -47,8 +46,8 @@ struct MV_MultFunctor { AV m_A; BMV m_B; - MV_MultFunctor(typename CMV::const_value_type& c, const CMV& C, - typename AV::const_value_type& ab, const AV& A, const BMV& B) + MV_MultFunctor(typename CMV::const_value_type& c, const CMV& C, typename AV::const_value_type& ab, const AV& A, + const BMV& B) : m_n(C.extent(1)), m_c(c), m_C(C), m_ab(ab), m_A(A), m_B(B) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type& i) const { @@ -101,8 +100,7 @@ struct MV_MultFunctor { /// /// C(i) = c * C(i) + ab * A(i) * B(i), subject to the usual /// BLAS update rules. -template +template struct V_MultFunctor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -113,8 +111,8 @@ struct V_MultFunctor { AV m_A; BV m_B; - V_MultFunctor(typename CV::const_value_type& c, const CV& C, - typename AV::const_value_type& ab, const AV& A, const BV& B) + V_MultFunctor(typename CV::const_value_type& c, const CV& C, typename AV::const_value_type& ab, const AV& A, + const BV& B) : m_c(c), m_C(C), m_ab(ab), m_A(A), m_B(B) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type& i) const { @@ -145,10 +143,8 @@ struct V_MultFunctor { /// C(i) = c * C(i) + ab * A(i) * B(i), subject to the usual BLAS /// update rules. template -void V_Mult_Generic(const execution_space& space, - typename CV::const_value_type& c, const CV& C, - typename AV::const_value_type& ab, const AV& A, - const BV& B) { +void V_Mult_Generic(const execution_space& space, typename CV::const_value_type& c, const CV& C, + typename AV::const_value_type& ab, const AV& A, const BV& B) { using Kokkos::ALL; using Kokkos::subview; typedef Kokkos::ArithTraits ATA; @@ -192,10 +188,8 @@ void V_Mult_Generic(const execution_space& space, /// C(i,j) = c * C(i,j) + ab * A(i) * B(i,j), subject to the usual /// BLAS update rules. template -void MV_Mult_Generic(const execution_space& space, - typename CMV::const_value_type& c, const CMV& C, - typename AV::const_value_type& ab, const AV& A, - const BMV& B) { +void MV_Mult_Generic(const execution_space& space, typename CMV::const_value_type& c, const CMV& C, + typename AV::const_value_type& ab, const AV& A, const BMV& B) { typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATC; @@ -205,8 +199,7 @@ void MV_Mult_Generic(const execution_space& space, typedef decltype(C_0) CV; typedef decltype(B_0) BV; - V_Mult_Generic(space, c, C_0, ab, A, - B_0); + V_Mult_Generic(space, c, C_0, ab, A, B_0); return; } diff --git a/blas/impl/KokkosBlas1_mult_spec.hpp b/blas/impl/KokkosBlas1_mult_spec.hpp index c81e00a6b0..3cd847dc1d 100644 --- a/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_mult_spec.hpp @@ -27,8 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_eti_spec_avail { enum : bool { value = false }; }; @@ -42,20 +41,17 @@ struct mult_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_MULT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct mult_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_MULT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct mult_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,21 +61,17 @@ struct mult_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct mult_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct mult_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -101,32 +93,24 @@ namespace Impl { /// Y(i,j) = alpha*A(i,j)*X(i,j) + gamma*Y(i,j) /// /// with special cases for alpha, or gamma = 0. -template ::value, - bool eti_spec_avail = - mult_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = mult_eti_spec_avail::value> struct Mult { - static void mult(const execution_space& space, - const typename YMV::non_const_value_type& gamma, - const YMV& Y, - const typename XMV::non_const_value_type& alpha, const AV& A, - const XMV& X); + static void mult(const execution_space& space, const typename YMV::non_const_value_type& gamma, const YMV& Y, + const typename XMV::non_const_value_type& alpha, const AV& A, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for YMV, AV, and XMV rank-2 Views. template -struct Mult { +struct Mult { typedef typename YMV::size_type size_type; typedef typename YMV::non_const_value_type YMV_scalar; typedef typename XMV::non_const_value_type XMV_scalar; - static void mult(const execution_space& space, const YMV_scalar& gamma, - const YMV& Y, const XMV_scalar& alpha, const AV& A, - const XMV& X) { + static void mult(const execution_space& space, const YMV_scalar& gamma, const YMV& Y, const XMV_scalar& alpha, + const AV& A, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Mult::mult: Y is not a Kokkos::View."); @@ -136,8 +120,7 @@ struct Mult::value, "KokkosBlas::Impl::" "Mult::mult: X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Mult::mult: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -149,31 +132,26 @@ struct Mult::mult: " "AV must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::mult[ETI]" - : "KokkosBlas::mult[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::mult[ETI]" + : "KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", - typeid(YMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", typeid(YMV).name(), typeid(AV).name(), + typeid(XMV).name()); else { - printf( - "KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", - typeid(YMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", typeid(YMV).name(), typeid(AV).name(), + typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Mult_Generic(space, gamma, Y, - alpha, A, X); + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + MV_Mult_Generic(space, gamma, Y, alpha, A, X); } else { - MV_Mult_Generic(space, gamma, Y, - alpha, A, X); + MV_Mult_Generic(space, gamma, Y, alpha, A, X); } Kokkos::Profiling::popRegion(); } @@ -181,15 +159,13 @@ struct Mult -struct Mult { +struct Mult { typedef typename YV::size_type size_type; typedef typename YV::non_const_value_type YV_scalar; typedef typename XV::non_const_value_type XV_scalar; - static void mult(const execution_space& space, const YV_scalar& gamma, - const YV& Y, const XV_scalar& alpha, const AV& A, - const XV& X) { + static void mult(const execution_space& space, const YV_scalar& gamma, const YV& Y, const XV_scalar& alpha, + const AV& A, const XV& X) { // YV, AV, and XV must be Kokkos::View specializations. static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -201,35 +177,30 @@ struct Mult::mult: X is not a Kokkos::View."); // XV must be nonconst (else it can't be an output argument). - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Mult::mult: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); static_assert((int)XV::rank == (int)YV::rank && (int)AV::rank == 1, "KokkosBlas::Impl::Mult::mult: " "X, Y, and Z must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::mult[ETI]" - : "KokkosBlas::mult[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::mult[ETI]" + : "KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", - typeid(YV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n", typeid(YV).name(), typeid(AV).name(), + typeid(XV).name()); else { - printf( - "KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", - typeid(YV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::mult<> non-ETI specialization for < %s , %s , %s >\n", typeid(YV).name(), typeid(AV).name(), + typeid(XV).name()); } #endif const size_type numRows = Y.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Mult_Generic(space, gamma, Y, alpha, - A, X); + V_Mult_Generic(space, gamma, Y, alpha, A, X); } else { - V_Mult_Generic(space, gamma, Y, - alpha, A, X); + V_Mult_Generic(space, gamma, Y, alpha, A, X); } Kokkos::Profiling::popRegion(); } @@ -248,30 +219,24 @@ struct Mult, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; -#define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Mult< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -282,32 +247,24 @@ struct Mult, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Mult< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Mult< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrm1_impl.hpp b/blas/impl/KokkosBlas1_nrm1_impl.hpp index a88c01023e..8ba857c9e9 100644 --- a/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -50,8 +50,7 @@ struct V_Nrm1_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm1_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Nrm1_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -80,11 +79,9 @@ struct Nrm1_MV_Functor { RV r; XV x; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product - Nrm1_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) - : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} + Nrm1_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMem& t) const { @@ -103,9 +100,7 @@ struct Nrm1_MV_Functor { }, localResult); - Kokkos::single(Kokkos::PerTeam(t), [&]() { - Kokkos::atomic_add(&r(i), rvalue_type(localResult)); - }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), rvalue_type(localResult)); }); } }; @@ -128,27 +123,23 @@ void V_Nrm1_Invoke(const execution_space& space, const RV& r, const XV& X) { template void MV_Nrm1_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::nrm1 (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::nrm1 (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " << x.extent(1) + << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for( - "KokkosBlas1::Nrm1::S1", pol, - Nrm1_MV_Functor(r, x, teamsPerVec)); + Kokkos::parallel_for("KokkosBlas1::Nrm1::S1", pol, + Nrm1_MV_Functor(r, x, teamsPerVec)); } // Version for when a temporary result view is needed (implemented in terms of @@ -156,15 +147,11 @@ void MV_Nrm1_Invoke( template void MV_Nrm1_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), - r.extent(0)); - MV_Nrm1_Invoke( - space, tempResult, x); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); + MV_Nrm1_Invoke(space, tempResult, x); Kokkos::deep_copy(space, r, tempResult); // Fence needed to ensure that the deep_copy // above finishes before we exit this function diff --git a/blas/impl/KokkosBlas1_nrm1_spec.hpp b/blas/impl/KokkosBlas1_nrm1_spec.hpp index 24f093c736..3977c5225c 100644 --- a/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -43,19 +43,15 @@ struct nrm1_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM1_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct nrm1_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,22 +61,17 @@ struct nrm1_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm1_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm1_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -92,10 +83,9 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = XMV::rank, - bool tpl_spec_avail = nrm1_tpl_spec_avail::value, - bool eti_spec_avail = nrm1_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrm1_eti_spec_avail::value> struct Nrm1 { static void nrm1(const execution_space& space, const RMV& R, const XMV& X); }; @@ -103,8 +93,7 @@ struct Nrm1 { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm1 for single vectors (1-D Views). template -struct Nrm1 { +struct Nrm1 { using size_type = typename XMV::size_type; static void nrm1(const execution_space& space, const RMV& R, const XMV& X) { @@ -120,16 +109,13 @@ struct Nrm1: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm1[ETI]" - : "KokkosBlas::nrm1[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm1[ETI]" + : "KokkosBlas::nrm1[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -145,8 +131,7 @@ struct Nrm1 -struct Nrm1 { +struct Nrm1 { using size_type = typename XMV::size_type; static void nrm1(const execution_space& space, const RV& R, const XMV& X) { @@ -165,32 +150,26 @@ struct Nrm1 ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm1[ETI]" - : "KokkosBlas::nrm1[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm1[ETI]" + : "KokkosBlas::nrm1[noETI]"); const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); if (numCols == Kokkos::ArithTraits::one()) { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm1_Invoke(space, - R0, X0); + V_Nrm1_Invoke(space, R0, X0); } else { typedef std::int64_t index_type; - V_Nrm1_Invoke( - space, R0, X0); + V_Nrm1_Invoke(space, R0, X0); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Nrm1_Invoke(space, R, X); } else { using index_type = std::int64_t; @@ -212,34 +191,26 @@ struct Nrm1::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm1::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm1 for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_NRM1_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Nrm1< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM1_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm1::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -248,19 +219,14 @@ struct Nrm1::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm1< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -268,20 +234,14 @@ struct Nrm1::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm1::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrm2_impl.hpp b/blas/impl/KokkosBlas1_nrm2_impl.hpp index 276023c171..e840d0bfd4 100644 --- a/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -51,8 +51,7 @@ struct V_Nrm2_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm2_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Nrm2_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -67,19 +66,12 @@ struct V_Nrm2_Functor { sum += tmp * tmp; } - KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = AT::zero(); - } + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = AT::zero(); } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { - update += source; - } + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { update += source; } KOKKOS_INLINE_FUNCTION void final(value_type& update) const { - if (m_take_sqrt) - update = - Kokkos::ArithTraits::sqrt(update); + if (m_take_sqrt) update = Kokkos::ArithTraits::sqrt(update); } }; @@ -102,11 +94,9 @@ struct Nrm2_MV_Functor { RV r; XV x; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product - Nrm2_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) - : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} + Nrm2_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMem& t) const { @@ -127,17 +117,14 @@ struct Nrm2_MV_Functor { }, localResult); - Kokkos::single(Kokkos::PerTeam(t), [&]() { - Kokkos::atomic_add(&r(i), rvalue_type(localResult)); - }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), rvalue_type(localResult)); }); } }; /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. template -void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, - const bool& take_sqrt) { +void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); Kokkos::RangePolicy policy(space, 0, numRows); @@ -153,32 +140,26 @@ void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, template void MV_Nrm2_Invoke( const execution_space& space, const RV& r, const XV& x, bool take_sqrt, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::nrm2 (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::nrm2 (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " << x.extent(1) + << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for( - "KokkosBlas1::Nrm2::S1", pol, - Nrm2_MV_Functor(r, x, teamsPerVec)); + Kokkos::parallel_for("KokkosBlas1::Nrm2::S1", pol, + Nrm2_MV_Functor(r, x, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for( - "KokkosBlas1::Nrm2::Sqrt", - Kokkos::RangePolicy(space, 0, r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for("KokkosBlas1::Nrm2::Sqrt", Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } @@ -187,15 +168,11 @@ void MV_Nrm2_Invoke( template void MV_Nrm2_Invoke( const execution_space& space, const RV& r, const XV& x, bool take_sqrt, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), - r.extent(0)); - MV_Nrm2_Invoke( - space, tempResult, x, take_sqrt); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); + MV_Nrm2_Invoke(space, tempResult, x, take_sqrt); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_nrm2_spec.hpp b/blas/impl/KokkosBlas1_nrm2_spec.hpp index 6c21e551a8..4d0b2e1396 100644 --- a/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -43,19 +43,15 @@ struct nrm2_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct nrm2_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,22 +61,17 @@ struct nrm2_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm2_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -92,24 +83,20 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = XMV::rank, - bool tpl_spec_avail = nrm2_tpl_spec_avail::value, - bool eti_spec_avail = nrm2_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrm2_eti_spec_avail::value> struct Nrm2 { - static void nrm2(const execution_space& space, const RMV& R, const XMV& X, - const bool& take_sqrt); + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2 for single vectors (1-D Views). template -struct Nrm2 { +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const execution_space& space, const RMV& R, const XMV& X, - const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<1-D>: RMV is not a Kokkos::View."); @@ -122,16 +109,13 @@ struct Nrm2: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm2[ETI]" - : "KokkosBlas::nrm2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2[ETI]" + : "KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -140,20 +124,17 @@ struct Nrm2(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(space, R, X, - take_sqrt); + V_Nrm2_Invoke(space, R, X, take_sqrt); } Kokkos::Profiling::popRegion(); } }; template -struct Nrm2 { +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const execution_space& space, const RV& R, const XMV& X, - const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RV& R, const XMV& X, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<2-D>: RV is not a Kokkos::View."); @@ -166,16 +147,13 @@ struct Nrm2: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm2[ETI]" - : "KokkosBlas::nrm2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2[ETI]" + : "KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif @@ -185,21 +163,17 @@ struct Nrm2(INT_MAX)) { - V_Nrm2_Invoke( - space, R0, X0, take_sqrt); + V_Nrm2_Invoke(space, R0, X0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke( - space, R0, X0, take_sqrt); + V_Nrm2_Invoke(space, R0, X0, take_sqrt); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Nrm2_Invoke(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2_Invoke(space, R, X, - take_sqrt); + MV_Nrm2_Invoke(space, R, X, take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -217,34 +191,26 @@ struct Nrm2::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2 for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_NRM2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Nrm2< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -253,19 +219,14 @@ struct Nrm2::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -273,20 +234,14 @@ struct Nrm2::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/blas/impl/KokkosBlas1_nrm2w_impl.hpp index fb9b1f7858..979ba2cec3 100644 --- a/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -46,16 +46,14 @@ struct V_Nrm2w_Functor { typename XV::const_type m_x, m_w; bool m_take_sqrt; - V_Nrm2w_Functor(const XV& x, const XV& w, bool take_sqrt) - : m_x(x), m_w(w), m_take_sqrt(take_sqrt) { + V_Nrm2w_Functor(const XV& x, const XV& w, bool take_sqrt) : m_x(x), m_w(w), m_take_sqrt(take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm2w_Functor: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Nrm2w_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Nrm2w_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -71,19 +69,12 @@ struct V_Nrm2w_Functor { ; } - KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = AT::zero(); - } + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = AT::zero(); } - KOKKOS_INLINE_FUNCTION void join(value_type& update, - const value_type& source) const { - update += source; - } + KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { update += source; } KOKKOS_INLINE_FUNCTION void final(value_type& update) const { - if (m_take_sqrt) - update = - Kokkos::ArithTraits::sqrt(update); + if (m_take_sqrt) update = Kokkos::ArithTraits::sqrt(update); } }; @@ -101,8 +92,7 @@ struct Nrm2w_MV_Functor { XV x; XV w; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product Nrm2w_MV_Functor(const RV& r_, const XV& x_, const XV& w_, int teamsPerVec_) : r(r_), x(x_), w(w_), teamsPerVec(teamsPerVec_) {} @@ -120,23 +110,19 @@ struct Nrm2w_MV_Functor { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(t, begin, end), [&](size_type k, value_type& update) { - const typename IPT::mag_type tmp = - IPT::norm(x(k, i)) / IPT::norm(w(k, i)); + const typename IPT::mag_type tmp = IPT::norm(x(k, i)) / IPT::norm(w(k, i)); update += tmp * tmp; }, localResult); - Kokkos::single(Kokkos::PerTeam(t), [&]() { - Kokkos::atomic_add(&r(i), rvalue_type(localResult)); - }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), rvalue_type(localResult)); }); } }; /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. template -void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, - const XV& W, const bool& take_sqrt) { +void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, const XV& W, const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); Kokkos::RangePolicy policy(space, 0, numRows); @@ -151,34 +137,27 @@ void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, // be computed in-place template void MV_Nrm2w_Invoke( - const execution_space& space, const RV& r, const XV& x, const XV& w, - bool take_sqrt, - typename std::enable_if::accessible>::type* = + const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::nrm2w (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::nrm2w (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " + << x.extent(1) << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("KokkosBlas1::Nrm2w::S1", pol, - Nrm2w_MV_Functor( - r, x, w, teamsPerVec)); + Nrm2w_MV_Functor(r, x, w, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for( - "KokkosBlas1::Nrm2w::Sqrt", - Kokkos::RangePolicy(space, 0, r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for("KokkosBlas1::Nrm2w::Sqrt", Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } @@ -186,17 +165,12 @@ void MV_Nrm2w_Invoke( // the other version) template void MV_Nrm2w_Invoke( - const execution_space& space, const RV& r, const XV& x, const XV& w, - bool take_sqrt, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2w temp result"), - r.extent(0)); - MV_Nrm2w_Invoke(space, tempResult, x, w, - take_sqrt); + const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2w temp result"), r.extent(0)); + MV_Nrm2w_Invoke(space, tempResult, x, w, take_sqrt); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/blas/impl/KokkosBlas1_nrm2w_spec.hpp index f4bbe286ef..5660832139 100644 --- a/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -42,20 +42,15 @@ struct nrm2w_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM2W_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm2w_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2w_eti_spec_avail::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,20 +60,16 @@ struct nrm2w_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrm2w_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrm2w_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -91,24 +82,19 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - nrm2w_eti_spec_avail::value> + bool tpl_spec_avail = nrm2w_tpl_spec_avail::value, + bool eti_spec_avail = nrm2w_eti_spec_avail::value> struct Nrm2w { - static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, - const XMV& W, const bool& take_sqrt); + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2w for single vectors (1-D Views). template -struct Nrm2w { +struct Nrm2w { using size_type = typename XMV::size_type; - static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, - const XMV& W, const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<1-D>: RMV is not a Kokkos::View."); @@ -121,16 +107,13 @@ struct Nrm2w: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm2w[ETI]" - : "KokkosBlas::nrm2w[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2w[ETI]" + : "KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -139,20 +122,17 @@ struct Nrm2w(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(space, R, X, W, - take_sqrt); + V_Nrm2w_Invoke(space, R, X, W, take_sqrt); } Kokkos::Profiling::popRegion(); } }; template -struct Nrm2w { +struct Nrm2w { using size_type = typename XMV::size_type; - static void nrm2w(const execution_space& space, const RV& R, const XMV& X, - const XMV& W, const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<2-D>: RV is not a Kokkos::View."); @@ -165,16 +145,13 @@ struct Nrm2w: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrm2w[ETI]" - : "KokkosBlas::nrm2w[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2w[ETI]" + : "KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrm2w<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif @@ -185,22 +162,17 @@ struct Nrm2w(INT_MAX)) { - V_Nrm2w_Invoke( - space, R0, X0, W0, take_sqrt); + V_Nrm2w_Invoke(space, R0, X0, W0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke( - space, R0, X0, W0, take_sqrt); + V_Nrm2w_Invoke(space, R0, X0, W0, take_sqrt); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(space, R, X, W, - take_sqrt); + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2w_Invoke(space, R, X, W, - take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -218,33 +190,25 @@ struct Nrm2w::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2w::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2w for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_NRM2W_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Nrm2w< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2w::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -253,17 +217,13 @@ struct Nrm2w::mag_type*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Nrm2w< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -271,18 +231,13 @@ struct Nrm2w::mag_type*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Nrm2w::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_nrminf_impl.hpp b/blas/impl/KokkosBlas1_nrminf_impl.hpp index b8431ac8ea..e7479e6697 100644 --- a/blas/impl/KokkosBlas1_nrminf_impl.hpp +++ b/blas/impl/KokkosBlas1_nrminf_impl.hpp @@ -50,8 +50,7 @@ struct V_NrmInf_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_NrmInf_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_NrmInf_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -85,8 +84,7 @@ void V_NrmInf_Invoke(const execution_space& space, const RV& r, const XV& X) { typedef V_NrmInf_Functor functor_type; functor_type op(X); - Kokkos::parallel_reduce("KokkosBlas::NrmInf::S0", policy, op, - Kokkos::Max(r())); + Kokkos::parallel_reduce("KokkosBlas::NrmInf::S0", policy, op, Kokkos::Max(r())); } /// \brief Compute the 2-norms (or their square) of the columns of the @@ -96,8 +94,7 @@ void MV_NrmInf_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_NrmInf_Invoke( - space, ri, Xi); + V_NrmInf_Invoke(space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_nrminf_spec.hpp b/blas/impl/KokkosBlas1_nrminf_spec.hpp index 3659d61f19..e7b365ce85 100644 --- a/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -43,20 +43,15 @@ struct nrminf_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRMINF_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrminf_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrminf_eti_spec_avail::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -66,22 +61,17 @@ struct nrminf_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct nrminf_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct nrminf_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -94,10 +84,8 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - nrminf_eti_spec_avail::value> + bool tpl_spec_avail = nrminf_tpl_spec_avail::value, + bool eti_spec_avail = nrminf_eti_spec_avail::value> struct NrmInf { static void nrminf(const execution_space& space, const RMV& R, const XMV& X); }; @@ -105,8 +93,7 @@ struct NrmInf { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of NrmInf for single vectors (1-D Views). template -struct NrmInf { +struct NrmInf { typedef typename XMV::size_type size_type; static void nrminf(const execution_space& space, const RMV& R, const XMV& X) { @@ -122,16 +109,13 @@ struct NrmInf: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrminf[ETI]" - : "KokkosBlas::nrminf[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrminf[ETI]" + : "KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -147,8 +131,7 @@ struct NrmInf -struct NrmInf { +struct NrmInf { typedef typename XMV::size_type size_type; static void nrminf(const execution_space& space, const RV& R, const XMV& X) { @@ -164,23 +147,19 @@ struct NrmInf: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::nrminf[ETI]" - : "KokkosBlas::nrminf[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrminf[ETI]" + : "KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::nrminf<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_NrmInf_Invoke(space, R, X); } else { typedef std::int64_t index_type; @@ -201,36 +180,26 @@ struct NrmInf::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct NrmInf::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for definition of full specialization of // KokkosBlas::Impl::NrmInf for rank == 2. This is NOT for users!!! We // use this macro in one or more .cpp files in this directory. // -#define KOKKOSBLAS1_NRMINF_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct NrmInf< \ - EXEC_SPACE, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, false, true>; +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct NrmInf::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; // // Macro for declaration of full specialization of @@ -239,19 +208,14 @@ struct NrmInf::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct NrmInf< \ + EXEC_SPACE, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -259,20 +223,14 @@ struct NrmInf::mag_type*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct NrmInf::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/blas/impl/KokkosBlas1_reciprocal_impl.hpp index 21f736ac4f..7ad6ab95db 100644 --- a/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -37,8 +37,7 @@ struct MV_Reciprocal_Functor { RMV R_; XMV X_; - MV_Reciprocal_Functor(const RMV& R, const XMV& X) - : numCols(X.extent(1)), R_(R), X_(X) { + MV_Reciprocal_Functor(const RMV& R, const XMV& X) : numCols(X.extent(1)), R_(R), X_(X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Reciprocal_Functor: RMV is not a Kokkos::View."); @@ -148,8 +147,7 @@ struct V_ReciprocalSelf_Functor { // Invoke the "generic" (not unrolled) multivector functor that // computes entry-wise reciprocalolute value. template -void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, - const XMV& X) { +void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Reciprocal_Generic: RMV is not a Kokkos::View."); @@ -177,8 +175,7 @@ void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, // Variant of MV_Reciprocal_Generic for single vectors (1-D Views) R and X. template -void V_Reciprocal_Generic(const execution_space& space, const RV& R, - const XV& X) { +void V_Reciprocal_Generic(const execution_space& space, const RV& R, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Reciprocal_Generic: RV is not a Kokkos::View."); diff --git a/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 08fc8bc341..988043511b 100644 --- a/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -42,18 +42,15 @@ struct reciprocal_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct reciprocal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -63,18 +60,15 @@ struct reciprocal_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct reciprocal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -87,24 +81,19 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - reciprocal_eti_spec_avail::value> + bool tpl_spec_avail = reciprocal_tpl_spec_avail::value, + bool eti_spec_avail = reciprocal_eti_spec_avail::value> struct Reciprocal { - static void reciprocal(const execution_space& space, const RMV& R, - const XMV& X); + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Reciprocal for single vectors (1-D Views). template -struct Reciprocal { +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const execution_space& space, const RMV& R, - const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<1-D>: RMV is not a Kokkos::View."); @@ -117,17 +106,14 @@ struct Reciprocal: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::reciprocal[ETI]" - : "KokkosBlas::reciprocal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::reciprocal[ETI]" + : "KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf( - "KokkosBlas1::reciprocal<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::reciprocal<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), + typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -144,12 +130,10 @@ struct Reciprocal -struct Reciprocal { +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const execution_space& space, const RMV& R, - const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<2-D>: RMV is not a Kokkos::View."); @@ -162,23 +146,19 @@ struct Reciprocal: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::reciprocal[ETI]" - : "KokkosBlas::reciprocal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::reciprocal[ETI]" + : "KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::asb<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; MV_Reciprocal_Generic(space, R, X); } else { @@ -200,15 +180,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -216,15 +193,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -234,15 +208,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -250,15 +221,12 @@ struct Reciprocal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Reciprocal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_rot_impl.hpp b/blas/impl/KokkosBlas1_rot_impl.hpp index 93d3b3d9b9..e139e916be 100644 --- a/blas/impl/KokkosBlas1_rot_impl.hpp +++ b/blas/impl/KokkosBlas1_rot_impl.hpp @@ -30,8 +30,7 @@ struct rot_functor { VectorView X, Y; ScalarView c, s; - rot_functor(VectorView const& X_, VectorView const& Y_, ScalarView const& c_, - ScalarView const& s_) + rot_functor(VectorView const& X_, VectorView const& Y_, ScalarView const& c_, ScalarView const& s_) : X(X_), Y(Y_), c(c_), s(s_) {} KOKKOS_INLINE_FUNCTION @@ -43,8 +42,8 @@ struct rot_functor { }; template -void Rot_Invoke(ExecutionSpace const& space, VectorView const& X, - VectorView const& Y, ScalarView const& c, ScalarView const& s) { +void Rot_Invoke(ExecutionSpace const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, + ScalarView const& s) { Kokkos::RangePolicy rot_policy(space, 0, X.extent(0)); rot_functor rot_func(X, Y, c, s); Kokkos::parallel_for("KokkosBlas::rot", rot_policy, rot_func); diff --git a/blas/impl/KokkosBlas1_rot_spec.hpp b/blas/impl/KokkosBlas1_rot_spec.hpp index 214e0399e5..4ca4d8d1ef 100644 --- a/blas/impl/KokkosBlas1_rot_spec.hpp +++ b/blas/impl/KokkosBlas1_rot_spec.hpp @@ -43,16 +43,14 @@ struct rot_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ROT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct rot_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rot_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -64,36 +62,28 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - rot_eti_spec_avail::value> + bool tpl_spec_avail = rot_tpl_spec_avail::value, + bool eti_spec_avail = rot_eti_spec_avail::value> struct Rot { - static void rot(ExecutionSpace const& space, VectorView const& X, - VectorView const& Y, ScalarView const& c, + static void rot(ExecutionSpace const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, ScalarView const& s); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rot. template -struct Rot { - static void rot(ExecutionSpace const& space, VectorView const& X, - VectorView const& Y, ScalarView const& c, +struct Rot { + static void rot(ExecutionSpace const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, ScalarView const& s) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::rot[ETI]" - : "KokkosBlas::rot[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::rot[ETI]" + : "KokkosBlas::rot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rot<> ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(VectorView).name(), - typeid(ScalarView).name()); + printf("KokkosBlas1::rot<> ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(VectorView).name(), typeid(ScalarView).name()); else { - printf("KokkosBlas1::rot<> non-ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(VectorView).name(), - typeid(ScalarView).name()); + printf("KokkosBlas1::rot<> non-ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(VectorView).name(), typeid(ScalarView).name()); } #endif Rot_Invoke(space, X, Y, c, s); @@ -112,14 +102,12 @@ struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + extern template struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; // @@ -127,14 +115,12 @@ struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROT_ETI_SPEC_INST(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_rotg_impl.hpp b/blas/impl/KokkosBlas1_rotg_impl.hpp index ff7830e147..834c773a8d 100644 --- a/blas/impl/KokkosBlas1_rotg_impl.hpp +++ b/blas/impl/KokkosBlas1_rotg_impl.hpp @@ -24,10 +24,8 @@ namespace KokkosBlas { namespace Impl { template ::is_complex, - bool>::type = true> -KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, - Scalar* s) { + typename std::enable_if::is_complex, bool>::type = true> +KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, Scalar* s) { const Scalar one = Kokkos::ArithTraits::one(); const Scalar zero = Kokkos::ArithTraits::zero(); @@ -40,12 +38,11 @@ KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, } else { const Scalar scaled_a = *a / numerical_scaling; const Scalar scaled_b = *b / numerical_scaling; - Scalar norm = Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b) * - numerical_scaling; - Scalar sign = Kokkos::abs(*a) > Kokkos::abs(*b) ? *a : *b; - norm = Kokkos::copysign(norm, sign); - *c = *a / norm; - *s = *b / norm; + Scalar norm = Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b) * numerical_scaling; + Scalar sign = Kokkos::abs(*a) > Kokkos::abs(*b) ? *a : *b; + norm = Kokkos::copysign(norm, sign); + *c = *a / norm; + *s = *b / norm; Scalar z = one; if (Kokkos::abs(*a) > Kokkos::abs(*b)) { @@ -60,10 +57,8 @@ KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, } template ::is_complex, - bool>::type = true> -KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, - Scalar* s) { + typename std::enable_if::is_complex, bool>::type = true> +KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, Scalar* s) { using mag_type = typename Kokkos::ArithTraits::mag_type; const Scalar one = Kokkos::ArithTraits::one(); @@ -78,13 +73,11 @@ KOKKOS_INLINE_FUNCTION void rotg_impl(Scalar* a, Scalar* b, Magnitude* c, } else { const Scalar scaled_a = Kokkos::abs(*a / numerical_scaling); const Scalar scaled_b = Kokkos::abs(*b / numerical_scaling); - mag_type norm = - Kokkos::abs(Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b)) * - numerical_scaling; - Scalar unit_a = *a / Kokkos::abs(*a); - *c = Kokkos::abs(*a) / norm; - *s = unit_a * Kokkos::conj(*b) / norm; - *a = unit_a * norm; + mag_type norm = Kokkos::abs(Kokkos::sqrt(scaled_a * scaled_a + scaled_b * scaled_b)) * numerical_scaling; + Scalar unit_a = *a / Kokkos::abs(*a); + *c = Kokkos::abs(*a) / norm; + *s = unit_a * Kokkos::conj(*b) / norm; + *a = unit_a * norm; } } @@ -94,20 +87,17 @@ struct rotg_functor { MViewType c; SViewType s; - rotg_functor(SViewType const& a_, SViewType const& b_, MViewType const& c_, - SViewType const& s_) + rotg_functor(SViewType const& a_, SViewType const& b_, MViewType const& c_, SViewType const& s_) : a(a_), b(b_), c(c_), s(s_) {} KOKKOS_INLINE_FUNCTION - void operator()(int const) const { - rotg_impl(a.data(), b.data(), c.data(), s.data()); - } + void operator()(int const) const { rotg_impl(a.data(), b.data(), c.data(), s.data()); } }; /// \brief Compute Givens rotation coefficients. template -void Rotg_Invoke(ExecutionSpace const& space, SViewType const& a, - SViewType const& b, MViewType const& c, SViewType const& s) { +void Rotg_Invoke(ExecutionSpace const& space, SViewType const& a, SViewType const& b, MViewType const& c, + SViewType const& s) { Kokkos::RangePolicy rotg_policy(space, 0, 1); rotg_functor rotg_func(a, b, c, s); Kokkos::parallel_for("KokkosBlas::rotg", rotg_policy, rotg_func); diff --git a/blas/impl/KokkosBlas1_rotg_spec.hpp b/blas/impl/KokkosBlas1_rotg_spec.hpp index bdf313e3d0..87618f12c9 100644 --- a/blas/impl/KokkosBlas1_rotg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotg_spec.hpp @@ -42,16 +42,14 @@ struct rotg_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ROTG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct rotg_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -63,34 +61,28 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - rotg_eti_spec_avail::value> + bool tpl_spec_avail = rotg_tpl_spec_avail::value, + bool eti_spec_avail = rotg_eti_spec_avail::value> struct Rotg { - static void rotg(ExecutionSpace const& space, SViewType const& a, - SViewType const& b, MViewType const& c, SViewType const& s); + static void rotg(ExecutionSpace const& space, SViewType const& a, SViewType const& b, MViewType const& c, + SViewType const& s); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rotg. template -struct Rotg { - static void rotg(ExecutionSpace const& space, SViewType const& a, - SViewType const& b, MViewType const& c, SViewType const& s) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::rotg[ETI]" - : "KokkosBlas::rotg[noETI]"); +struct Rotg { + static void rotg(ExecutionSpace const& space, SViewType const& a, SViewType const& b, MViewType const& c, + SViewType const& s) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::rotg[ETI]" + : "KokkosBlas::rotg[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rotg<> ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(SViewType).name(), - typeid(MViewType).name()); + printf("KokkosBlas1::rotg<> ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(SViewType).name(), typeid(MViewType).name()); else { - printf("KokkosBlas1::rotg<> non-ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(SViewType).name(), - typeid(MViewType).name()); + printf("KokkosBlas1::rotg<> non-ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(SViewType).name(), typeid(MViewType).name()); } #endif Rotg_Invoke(space, a, b, c, s); @@ -109,14 +101,12 @@ struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_DECL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + extern template struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; // @@ -124,14 +114,12 @@ struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_INST(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_rotm_impl.hpp b/blas/impl/KokkosBlas1_rotm_impl.hpp index 91a2c7a1d8..697cb7902f 100644 --- a/blas/impl/KokkosBlas1_rotm_impl.hpp +++ b/blas/impl/KokkosBlas1_rotm_impl.hpp @@ -36,9 +36,7 @@ struct rotm_functor { VectorView X, Y; ParamView param; - rotm_functor(VectorView const& X_, VectorView const& Y_, - ParamView const& param_) - : X(X_), Y(Y_), param(param_) {} + rotm_functor(VectorView const& X_, VectorView const& Y_, ParamView const& param_) : X(X_), Y(Y_), param(param_) {} KOKKOS_INLINE_FUNCTION void operator()(const minus_one_tag&, const int idx) const { @@ -63,11 +61,9 @@ struct rotm_functor { }; template -void Rotm_Invoke(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param) { +void Rotm_Invoke(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { using Scalar = typename VectorView::value_type; - static_assert(!Kokkos::ArithTraits::is_complex, - "rotm is not defined for complex types!"); + static_assert(!Kokkos::ArithTraits::is_complex, "rotm is not defined for complex types!"); Scalar const zero = Kokkos::ArithTraits::zero(); Scalar const one = Kokkos::ArithTraits::one(); @@ -82,24 +78,19 @@ void Rotm_Invoke(execution_space const& space, VectorView const& X, if (flag == -two) { return; } else if (flag == -one) { - Kokkos::RangePolicy< - execution_space, - typename rotm_functor::minus_one_tag> - rotm_policy(space, 0, X.extent(0)); + Kokkos::RangePolicy::minus_one_tag> rotm_policy( + space, 0, X.extent(0)); Kokkos::parallel_for("KokkosBlas1::rotm_minus_one", rotm_policy, myFunc); } else if (flag == zero) { - Kokkos::RangePolicy::zero_tag> - rotm_policy(space, 0, X.extent(0)); + Kokkos::RangePolicy::zero_tag> rotm_policy( + space, 0, X.extent(0)); Kokkos::parallel_for("KokkosBlas1::rotm_zero", rotm_policy, myFunc); } else if (flag == one) { - Kokkos::RangePolicy::one_tag> - rotm_policy(space, 0, X.extent(0)); + Kokkos::RangePolicy::one_tag> rotm_policy( + space, 0, X.extent(0)); Kokkos::parallel_for("KokkosBlas1::rotm_one", rotm_policy, myFunc); } else { - throw std::runtime_error( - "KokkosBlas::rotm: param(0) is not -2, -1, 0 or 1!"); + throw std::runtime_error("KokkosBlas::rotm: param(0) is not -2, -1, 0 or 1!"); } } diff --git a/blas/impl/KokkosBlas1_rotm_spec.hpp b/blas/impl/KokkosBlas1_rotm_spec.hpp index 854f2abacc..5000b35fc3 100644 --- a/blas/impl/KokkosBlas1_rotm_spec.hpp +++ b/blas/impl/KokkosBlas1_rotm_spec.hpp @@ -41,16 +41,14 @@ struct rotm_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ROTM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct rotm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -61,34 +59,27 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class VectorView, class ParamView, - bool tpl_spec_avail = - rotm_tpl_spec_avail::value, - bool eti_spec_avail = - rotm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = rotm_eti_spec_avail::value> struct Rotm { - static void rotm(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param); + static void rotm(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rotm. template -struct Rotm { - static void rotm(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::rotm[ETI]" - : "KokkosBlas::rotm[noETI]"); +struct Rotm { + static void rotm(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::rotm[ETI]" + : "KokkosBlas::rotm[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rotm<> ETI specialization for < %s, %s >\n", - typeid(VectorView).name(), typeid(ParamView).name()); + printf("KokkosBlas1::rotm<> ETI specialization for < %s, %s >\n", typeid(VectorView).name(), + typeid(ParamView).name()); else { - printf("KokkosBlas1::rotm<> non-ETI specialization for < %s, %s >\n", - typeid(VectorView).name(), typeid(ParamView).name()); + printf("KokkosBlas1::rotm<> non-ETI specialization for < %s, %s >\n", typeid(VectorView).name(), + typeid(ParamView).name()); } #endif Rotm_Invoke(space, X, Y, param); @@ -107,14 +98,12 @@ struct Rotm, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; // @@ -122,14 +111,12 @@ struct Rotm, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_rotmg_impl.hpp b/blas/impl/KokkosBlas1_rotmg_impl.hpp index b35fd62ece..558020e5a4 100644 --- a/blas/impl/KokkosBlas1_rotmg_impl.hpp +++ b/blas/impl/KokkosBlas1_rotmg_impl.hpp @@ -25,8 +25,7 @@ namespace KokkosBlas { namespace Impl { template -KOKKOS_INLINE_FUNCTION void rotmg_impl(DXView const& d1, DXView const& d2, - DXView const& x1, YView const& y1, +KOKKOS_INLINE_FUNCTION void rotmg_impl(DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, PView const& param) { using Scalar = typename DXView::non_const_value_type; @@ -133,8 +132,7 @@ KOKKOS_INLINE_FUNCTION void rotmg_impl(DXView const& d1, DXView const& d2, // Rescale d2, h21 and h22 if (d2() != zero) { - while ((Kokkos::abs(d2()) <= gammasqinv) || - (Kokkos::abs(d2()) >= gammasq)) { + while ((Kokkos::abs(d2()) <= gammasqinv) || (Kokkos::abs(d2()) >= gammasq)) { if (flag == zero) { h11 = one; h22 = one; @@ -182,8 +180,7 @@ struct rotmg_functor { YView y1; PView param; - rotmg_functor(DXView& d1_, DXView& d2_, DXView& x1_, const YView& y1_, - PView& param_) + rotmg_functor(DXView& d1_, DXView& d2_, DXView& x1_, const YView& y1_, PView& param_) : d1(d1_), d2(d2_), x1(x1_), y1(y1_), param(param_) {} KOKKOS_INLINE_FUNCTION @@ -191,12 +188,10 @@ struct rotmg_functor { }; template -void Rotmg_Invoke(execution_space const& space, DXView const& d1, - DXView const& d2, DXView const& x1, YView const& y1, +void Rotmg_Invoke(execution_space const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, PView const& param) { using Scalar = typename DXView::value_type; - static_assert(!Kokkos::ArithTraits::is_complex, - "rotmg is not defined for complex types!"); + static_assert(!Kokkos::ArithTraits::is_complex, "rotmg is not defined for complex types!"); rotmg_functor myFunc(d1, d2, x1, y1, param); Kokkos::RangePolicy rotmg_policy(space, 0, 1); diff --git a/blas/impl/KokkosBlas1_rotmg_spec.hpp b/blas/impl/KokkosBlas1_rotmg_spec.hpp index b90a158654..caa44dda5d 100644 --- a/blas/impl/KokkosBlas1_rotmg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotmg_spec.hpp @@ -41,19 +41,16 @@ struct rotmg_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_ROTMG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -64,38 +61,30 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class DXView, class YView, class PView, - bool tpl_spec_avail = - rotmg_tpl_spec_avail::value, - bool eti_spec_avail = - rotmg_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = rotmg_eti_spec_avail::value> struct Rotmg { - static void rotmg(execution_space const& space, DXView& d1, DXView& d2, - DXView& x1, YView& y1, PView& param); + static void rotmg(execution_space const& space, DXView& d1, DXView& d2, DXView& x1, YView& y1, PView& param); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Rotmg. template -struct Rotmg { - static void rotmg(execution_space const& space, DXView& d1, DXView& d2, - DXView& x1, YView& y1, PView& param) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::rotmg[ETI]" - : "KokkosBlas::rotmg[noETI]"); +struct Rotmg { + static void rotmg(execution_space const& space, DXView& d1, DXView& d2, DXView& x1, YView& y1, PView& param) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::rotmg[ETI]" + : "KokkosBlas::rotmg[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::rotmg<> ETI specialization for < %s, %s, %s >\n", - typeid(DXView).name(), typeid(YView).name(), typeid(PView).name()); + printf("KokkosBlas1::rotmg<> ETI specialization for < %s, %s, %s >\n", typeid(DXView).name(), + typeid(YView).name(), typeid(PView).name()); else { - printf("KokkosBlas1::rotmg<> non-ETI specialization for < %s, %s, %s >\n", - typeid(DXView).name(), typeid(YView).name(), typeid(PView).name()); + printf("KokkosBlas1::rotmg<> non-ETI specialization for < %s, %s, %s >\n", typeid(DXView).name(), + typeid(YView).name(), typeid(PView).name()); } #endif - Rotmg_Invoke(space, d1, d2, x1, y1, - param); + Rotmg_Invoke(space, d1, d2, x1, y1, param); Kokkos::Profiling::popRegion(); } }; @@ -111,16 +100,13 @@ struct Rotmg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; // @@ -128,16 +114,13 @@ struct Rotmg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_scal_impl.hpp b/blas/impl/KokkosBlas1_scal_impl.hpp index 541d9a4934..510ca3808f 100644 --- a/blas/impl/KokkosBlas1_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_impl.hpp @@ -51,23 +51,16 @@ struct V_Scal_Functor { XV m_x; AV m_a; - V_Scal_Functor(const RV& r, const XV& x, const AV& a, - const SizeType startingColumn) - : m_r(r), m_x(x), m_a(a) { - static_assert(Kokkos::is_view::value, - "V_Scal_Functor: RV is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "V_Scal_Functor: AV is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "V_Scal_Functor: XV is not a Kokkos::View."); + V_Scal_Functor(const RV& r, const XV& x, const AV& a, const SizeType startingColumn) : m_r(r), m_x(x), m_a(a) { + static_assert(Kokkos::is_view::value, "V_Scal_Functor: RV is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "V_Scal_Functor: AV is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "V_Scal_Functor: XV is not a Kokkos::View."); static_assert(RV::rank == 1, "V_Scal_Functor: RV is not rank 1."); static_assert(AV::rank == 1, "V_Scal_Functor: AV is not rank 1."); static_assert(XV::rank == 1, "V_Scal_Functor: XV is not rank 1."); if (startingColumn != 0) { - m_a = Kokkos::subview( - a, - std::make_pair(startingColumn, static_cast(a.extent(0)))); + m_a = Kokkos::subview(a, std::make_pair(startingColumn, static_cast(a.extent(0)))); } } @@ -98,8 +91,7 @@ struct V_Scal_Functor { // 1. Y(i) = alpha*X(i) for alpha in -1,0,1 // 2. Y(i) = a*X(i) template -struct V_Scal_Functor { +struct V_Scal_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -107,8 +99,7 @@ struct V_Scal_Functor -void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, - const XV& x, const SizeType startingColumn, int a = 2) { - static_assert(Kokkos::is_view::value, - "V_Scal_Generic: RV is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "V_Scal_Generic: XV is not a Kokkos::View."); +void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, const XV& x, const SizeType startingColumn, + int a = 2) { + static_assert(Kokkos::is_view::value, "V_Scal_Generic: RV is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "V_Scal_Generic: XV is not a Kokkos::View."); static_assert(RV::rank == 1, "V_Scal_Generic: RV is not rank 1."); static_assert(XV::rank == 1, "V_Scal_Generic: XV is not rank 1."); diff --git a/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/blas/impl/KokkosBlas1_scal_mv_impl.hpp index da4d7a5149..a729e85025 100644 --- a/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -42,8 +42,7 @@ namespace Impl { // coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does not apply to // coefficients in the a vector, if they are used. -template +template struct MV_Scal_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -53,13 +52,11 @@ struct MV_Scal_Functor { XMV X_; aVector a_; - MV_Scal_Functor(const RMV& R, const XMV& X, const aVector& a, - const SizeType startingColumn) + MV_Scal_Functor(const RMV& R, const XMV& X, const aVector& a, const SizeType startingColumn) : numCols(X.extent(1)), R_(R), X_(X), a_(a) { if (startingColumn != 0) { - auto rng = - std::make_pair(startingColumn, static_cast(a.extent(0))); - a_ = Kokkos::subview(a, rng); + auto rng = std::make_pair(startingColumn, static_cast(a.extent(0))); + a_ = Kokkos::subview(a, rng); } } @@ -124,8 +121,7 @@ struct MV_Scal_Functor { // This version works by partial specialization on aVector. // In this partial specialization, aVector is a scalar. template -struct MV_Scal_Functor { +struct MV_Scal_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -134,8 +130,7 @@ struct MV_Scal_Functor +template struct MV_Scal_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -203,13 +197,11 @@ struct MV_Scal_Unroll_Functor { XMV m_x; aVector m_a; - MV_Scal_Unroll_Functor(const RMV& r, const XMV& x, const aVector& a, - const SizeType startingColumn) + MV_Scal_Unroll_Functor(const RMV& r, const XMV& x, const aVector& a, const SizeType startingColumn) : m_r(r), m_x(x), m_a(a) { if (startingColumn != 0) { - auto rng = - std::make_pair(startingColumn, static_cast(a.extent(0))); - m_a = Kokkos::subview(a, rng); + auto rng = std::make_pair(startingColumn, static_cast(a.extent(0))); + m_a = Kokkos::subview(a, rng); } } @@ -254,8 +246,7 @@ struct MV_Scal_Unroll_Functor { // than a vector of coefficients) a. The number of columns in X, // UNROLL, is a compile-time constant. template -struct MV_Scal_Unroll_Functor { +struct MV_Scal_Unroll_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -263,8 +254,7 @@ struct MV_Scal_Unroll_Functor -void MV_Scal_Unrolled(const execution_space& space, const RMV& r, - const aVector& av, const XMV& x, +template +void MV_Scal_Unrolled(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, const SizeType startingColumn, int a = 2) { if (a == 0) { - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S0", policy, op); return; } if (a == -1) { - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S1", policy, op); return; } if (a == 1) { - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S2", policy, op); @@ -350,8 +335,7 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, } // a arbitrary (not -1, 0, or 1) - MV_Scal_Unroll_Functor op( - r, x, av, startingColumn); + MV_Scal_Unroll_Functor op(r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S3", policy, op); @@ -371,36 +355,30 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Generic(const execution_space& space, const RVector& r, - const aVector& av, const XVector& x, +template +void MV_Scal_Generic(const execution_space& space, const RVector& r, const aVector& av, const XVector& x, const SizeType startingColumn, int a = 2) { const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S4", policy, op); return; } if (a == -1) { - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S5", policy, op); return; } if (a == 1) { - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S6", policy, op); return; } // a arbitrary (not -1, 0, or 1) - MV_Scal_Functor op(r, x, av, - startingColumn); + MV_Scal_Functor op(r, x, av, startingColumn); Kokkos::parallel_for("KokkosBlas::Scal::MV::S7", policy, op); } @@ -419,8 +397,7 @@ void MV_Scal_Generic(const execution_space& space, const RVector& r, // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. template -void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, - const AV& av, const XMV& x, int a = 2) { +void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); #if KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL <= 2 @@ -437,8 +414,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled( - space, R_cur, av, X_cur, j, a); + MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); } for (; j + 4 <= numCols; j += 4) { const std::pair rng(j, j + 4); @@ -447,8 +423,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled( - space, R_cur, av, X_cur, j, a); + MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); } for (; j < numCols; ++j) { // RMV and XMV need to turn 1-D. @@ -457,8 +432,7 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(r_cur) RV; typedef decltype(x_cur) XV; - V_Scal_Generic(space, r_cur, av, - x_cur, j, a); + V_Scal_Generic(space, r_cur, av, x_cur, j, a); } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL > 2 @@ -470,73 +444,25 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, typedef decltype(r_0) RV; typedef decltype(x_0) XV; - V_Scal_Generic(space, r_0, av, x_0, - 0, a); + V_Scal_Generic(space, r_0, av, x_0, 0, a); break; } - case 2: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 3: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 4: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 5: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 6: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 7: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 8: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 9: - MV_Scal_Unrolled(space, r, av, - x, 0, a); - break; - case 10: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 11: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 12: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 13: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 14: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 15: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - case 16: - MV_Scal_Unrolled( - space, r, av, x, 0, a); - break; - default: - MV_Scal_Generic(space, r, av, x, - 0, a); + case 2: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 3: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 4: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 5: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 6: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 7: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 8: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 9: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 10: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 11: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 12: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 13: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 14: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 15: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 16: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + default: MV_Scal_Generic(space, r, av, x, 0, a); } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL @@ -556,27 +482,23 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, - const aVector& av, const XMV& x, int a = 2) { +template +void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); if (numCols == 1) { - typedef Kokkos::View + typedef Kokkos::View RV; - typedef Kokkos::View + typedef Kokkos::View XV; RV r_0 = Kokkos::subview(r, Kokkos::ALL(), 0); XV x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); - V_Scal_Generic(space, r_0, - av, x_0, a); + V_Scal_Generic(space, r_0, av, x_0, a); } else { - MV_Scal_Generic(space, r, av, - x, a); + MV_Scal_Generic(space, r, av, x, a); } } diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index 38972b2223..70a95d33e2 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -29,8 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_eti_spec_avail { enum : bool { value = false }; }; @@ -44,18 +43,16 @@ struct scal_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_SCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct scal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -65,33 +62,27 @@ struct scal_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct scal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct scal_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct scal_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -103,28 +94,22 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = - scal_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = scal_eti_spec_avail::value> struct Scal { - static void scal(const execution_space& space, const RV& R, const AV& A, - const XV& X); + static void scal(const execution_space& space, const RV& R, const AV& A, const XV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Scal for single vectors (1-D Views). template -struct Scal { +struct Scal { typedef typename XV::non_const_value_type AV; typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RV& R, const AV& alpha, - const XV& X) { + static void scal(const execution_space& space, const RV& R, const AV& alpha, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<1-D>: RV is not a Kokkos::View."); @@ -137,18 +122,16 @@ struct Scal: " "XV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::scal[ETI]" - : "KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::scal[ETI]" + : "KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::scal<1D> ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::scal<1D> ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(AV).name(), + typeid(XV).name()); else - printf( - "KokkosBlas1::scal<1D> non-ETI specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(AV).name(), typeid(XV).name()); + printf("KokkosBlas1::scal<1D> non-ETI specialization for < %s , %s , %s >\n", typeid(RV).name(), + typeid(AV).name(), typeid(XV).name()); #endif const size_type numRows = X.extent(0); @@ -163,12 +146,10 @@ struct Scal(INT_MAX)) { typedef int index_type; - V_Scal_Generic(space, R, alpha, - X, a); + V_Scal_Generic(space, R, alpha, X, a); } else { typedef typename XV::size_type index_type; - V_Scal_Generic(space, R, alpha, - X, a); + V_Scal_Generic(space, R, alpha, X, a); } Kokkos::Profiling::popRegion(); } @@ -181,13 +162,11 @@ struct Scal -struct Scal { +struct Scal { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RMV& R, const AV& av, - const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& av, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D>: RMV is not a Kokkos::View."); @@ -206,31 +185,26 @@ struct Scal: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::scal[ETI]" - : "KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::scal[ETI]" + : "KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), typeid(AV).name(), + typeid(XMV).name()); else - printf( - "KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), + typeid(AV).name(), typeid(XMV).name()); #endif const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); const int a = (av.extent(0) == 0) ? 0 : 2; - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(space, R, - av, X, a); + MV_Scal_Invoke_Left(space, R, av, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(space, R, - av, X, a); + MV_Scal_Invoke_Left(space, R, av, X, a); } Kokkos::Profiling::popRegion(); } @@ -243,14 +217,13 @@ struct Scal -struct Scal { +struct Scal { typedef typename XMV::non_const_value_type AV; typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RMV& R, const AV& alpha, - const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& alpha, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D, AV=scalar>: RMV is not a Kokkos::View."); @@ -263,18 +236,16 @@ struct Scal: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::scal[ETI]" - : "KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::scal[ETI]" + : "KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), typeid(AV).name(), + typeid(XMV).name()); else - printf( - "KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", - typeid(RMV).name(), typeid(AV).name(), typeid(XMV).name()); + printf("KokkosBlas1::scal<2D> non-ETI specialization for < %s , %s , %s >\n", typeid(RMV).name(), + typeid(AV).name(), typeid(XMV).name()); #endif const size_type numRows = X.extent(0); @@ -288,17 +259,14 @@ struct Scal(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left( - space, R, alpha, X, a); + MV_Scal_Invoke_Left(space, R, alpha, X, + a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left( - space, R, alpha, X, a); + MV_Scal_Invoke_Left(space, R, alpha, X, + a); } Kokkos::Profiling::popRegion(); } @@ -315,26 +283,22 @@ struct Scal, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; -#define KOKKOSBLAS1_SCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -343,50 +307,38 @@ struct Scal, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - extern template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + extern template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2, false, true>; \ - template struct Scal< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2, false, true>; \ + template struct Scal< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_serial_scal_impl.hpp b/blas/impl/KokkosBlas1_serial_scal_impl.hpp index 4de4f18cc2..d783841929 100644 --- a/blas/impl/KokkosBlas1_serial_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_serial_scal_impl.hpp @@ -28,8 +28,7 @@ namespace Impl { struct SerialScaleInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -39,10 +38,8 @@ struct SerialScaleInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (as0 > as1) for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); else diff --git a/blas/impl/KokkosBlas1_set_impl.hpp b/blas/impl/KokkosBlas1_set_impl.hpp index 38604dc4b2..037720253b 100644 --- a/blas/impl/KokkosBlas1_set_impl.hpp +++ b/blas/impl/KokkosBlas1_set_impl.hpp @@ -30,8 +30,7 @@ namespace Impl { struct SerialSetInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -41,10 +40,8 @@ struct SerialSetInternal { } template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (as0 > as1) for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); else @@ -59,32 +56,22 @@ struct SerialSetInternal { /// ================== struct TeamSetInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), - [&](const int &i) { A[i * as0] = alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { A[i * as0] = alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - SerialSetInternal::invoke(n, alpha, A + i * as0, as1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { SerialSetInternal::invoke(n, alpha, A + i * as0, as1); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - SerialSetInternal::invoke(m, alpha, A + j * as1, as0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { SerialSetInternal::invoke(m, alpha, A + j * as1, as0); }); } // member.team_barrier(); return 0; @@ -96,36 +83,24 @@ struct TeamSetInternal { /// ======================== struct TeamVectorSetInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { A[i * as0] = alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { A[i * as0] = alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] = alpha; }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A[i * as0 + j * as1] = alpha; }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] = alpha; }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { A[i * as0 + j * as1] = alpha; }); + }); } // member.team_barrier(); return 0; diff --git a/blas/impl/KokkosBlas1_sum_impl.hpp b/blas/impl/KokkosBlas1_sum_impl.hpp index 864c983541..222982dc24 100644 --- a/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/blas/impl/KokkosBlas1_sum_impl.hpp @@ -51,8 +51,7 @@ struct V_Sum_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::V_Sum_Functor: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Sum_Functor: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -75,11 +74,9 @@ struct Sum_MV_Functor { RV r; XV x; - size_type - teamsPerVec; // number of teams collectively performing a dot product + size_type teamsPerVec; // number of teams collectively performing a dot product - Sum_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) - : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} + Sum_MV_Functor(const RV& r_, const XV& x_, int teamsPerVec_) : r(r_), x(x_), teamsPerVec(teamsPerVec_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMem& t) const { @@ -92,12 +89,10 @@ struct Sum_MV_Functor { value_type localResult = AT::zero(); Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(t, begin, end), - [&](size_type k, value_type& update) { update += x(k, i); }, + Kokkos::TeamThreadRange(t, begin, end), [&](size_type k, value_type& update) { update += x(k, i); }, localResult); - Kokkos::single(Kokkos::PerTeam(t), - [&]() { Kokkos::atomic_add(&r(i), localResult); }); + Kokkos::single(Kokkos::PerTeam(t), [&]() { Kokkos::atomic_add(&r(i), localResult); }); } }; @@ -120,27 +115,23 @@ void V_Sum_Invoke(const execution_space& space, const RV& r, const XV& X) { template void MV_Sum_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = + typename std::enable_if::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; - oss << "KokkosBlas::Sum (rank-2): result vector has wrong length (" - << r.extent(0) << ", but x has " << x.extent(1) << " columns)"; + oss << "KokkosBlas::Sum (rank-2): result vector has wrong length (" << r.extent(0) << ", but x has " << x.extent(1) + << " columns)"; throw std::runtime_error(oss.str()); } // Zero out the result vector - Kokkos::deep_copy( - space, r, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; - KokkosBlas::Impl::multipleReductionWorkDistribution( - x.extent(0), x.extent(1), teamsPerVec); + KokkosBlas::Impl::multipleReductionWorkDistribution(x.extent(0), x.extent(1), + teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); - Kokkos::parallel_for( - "KokkosBlas1::Sum::S1", pol, - Sum_MV_Functor(r, x, teamsPerVec)); + Kokkos::parallel_for("KokkosBlas1::Sum::S1", pol, + Sum_MV_Functor(r, x, teamsPerVec)); } // Version for when a temporary result view is needed (implemented in terms of @@ -148,15 +139,11 @@ void MV_Sum_Invoke( template void MV_Sum_Invoke( const execution_space& space, const RV& r, const XV& x, - typename std::enable_if::accessible>::type* = - nullptr) { - Kokkos::View - tempResult( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), - r.extent(0)); - MV_Sum_Invoke( - space, tempResult, x); + typename std::enable_if< + !Kokkos::SpaceAccessibility::accessible>::type* = nullptr) { + Kokkos::View tempResult( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), r.extent(0)); + MV_Sum_Invoke(space, tempResult, x); Kokkos::deep_copy(space, r, tempResult); space.fence(); } diff --git a/blas/impl/KokkosBlas1_sum_spec.hpp b/blas/impl/KokkosBlas1_sum_spec.hpp index 458e7ffdb7..6df41e0309 100644 --- a/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/blas/impl/KokkosBlas1_sum_spec.hpp @@ -43,17 +43,14 @@ struct sum_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_SUM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct sum_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SUM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct sum_eti_spec_avail >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -63,20 +60,16 @@ struct sum_eti_spec_avail { // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct sum_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct sum_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -88,10 +81,9 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template < - class execution_space, class RMV, class XMV, int rank = XMV::rank, - bool tpl_spec_avail = sum_tpl_spec_avail::value, - bool eti_spec_avail = sum_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = sum_eti_spec_avail::value> struct Sum { static void sum(const execution_space& space, const RMV& R, const XMV& X); }; @@ -99,8 +91,7 @@ struct Sum { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Sum for single vectors (1-D Views). template -struct Sum { +struct Sum { typedef typename XMV::size_type size_type; static void sum(const execution_space& space, const RMV& R, const XMV& X) { @@ -116,17 +107,14 @@ struct Sum: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::sum[ETI]" - : "KokkosBlas::sum[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::sum[ETI]" + : "KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", - typeid(RMV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", typeid(RMV).name(), typeid(XMV).name()); } #endif const size_type numRows = X.extent(0); @@ -142,8 +130,7 @@ struct Sum -struct Sum { +struct Sum { typedef typename XMV::size_type size_type; static void sum(const execution_space& space, const RV& R, const XMV& X) { @@ -159,16 +146,13 @@ struct Sum: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::sum[ETI]" - : "KokkosBlas::sum[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::sum[ETI]" + : "KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); else { - printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XMV).name()); + printf("KokkosBlas1::sum<> non-ETI specialization for < %s , %s >\n", typeid(RV).name(), typeid(XMV).name()); } #endif @@ -178,16 +162,13 @@ struct Sum(INT_MAX)) { - V_Sum_Invoke(space, - R0, X0); + V_Sum_Invoke(space, R0, X0); } else { typedef std::int64_t index_type; - V_Sum_Invoke( - space, R0, X0); + V_Sum_Invoke(space, R0, X0); } } else { - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { MV_Sum_Invoke(space, R, X); } else { typedef std::int64_t index_type; @@ -209,14 +190,11 @@ struct Sum >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Sum< \ + EXEC_SPACE, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -224,13 +202,11 @@ struct Sum >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Sum >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -240,17 +216,13 @@ struct Sum, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Sum< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; // @@ -258,17 +230,13 @@ struct Sum, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Sum< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas1_swap_impl.hpp b/blas/impl/KokkosBlas1_swap_impl.hpp index 32a13d6469..7d4d22b514 100644 --- a/blas/impl/KokkosBlas1_swap_impl.hpp +++ b/blas/impl/KokkosBlas1_swap_impl.hpp @@ -42,8 +42,7 @@ struct swap_functor { }; template -void Swap_Invoke(ExecutionSpace const& space, XVector const& X, - YVector const& Y) { +void Swap_Invoke(ExecutionSpace const& space, XVector const& X, YVector const& Y) { Kokkos::RangePolicy swap_policy(space, 0, X.extent(0)); swap_functor swap_func(X, Y); Kokkos::parallel_for("KokkosBlas::swap", swap_policy, swap_func); diff --git a/blas/impl/KokkosBlas1_swap_spec.hpp b/blas/impl/KokkosBlas1_swap_spec.hpp index db09a62f8f..749552a81c 100644 --- a/blas/impl/KokkosBlas1_swap_spec.hpp +++ b/blas/impl/KokkosBlas1_swap_spec.hpp @@ -44,15 +44,13 @@ struct swap_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_SWAP_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct swap_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct swap_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -64,34 +62,26 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - swap_eti_spec_avail::value> + bool tpl_spec_avail = swap_tpl_spec_avail::value, + bool eti_spec_avail = swap_eti_spec_avail::value> struct Swap { - static void swap(ExecutionSpace const& space, XVector const& X, - YVector const& Y); + static void swap(ExecutionSpace const& space, XVector const& X, YVector const& Y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Swap. template -struct Swap { - static void swap(ExecutionSpace const& space, XVector const& X, - YVector const& Y) { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::swap[ETI]" - : "KokkosBlas::swap[noETI]"); +struct Swap { + static void swap(ExecutionSpace const& space, XVector const& X, YVector const& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::swap[ETI]" + : "KokkosBlas::swap[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::swap<> ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(XVector).name(), - typeid(YVector).name()); + printf("KokkosBlas1::swap<> ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(XVector).name(), typeid(YVector).name()); else { - printf("KokkosBlas1::swap<> non-ETI specialization for < %s, %s, %s >\n", - typeid(ExecutionSpace).name(), typeid(XVector).name(), - typeid(YVector).name()); + printf("KokkosBlas1::swap<> non-ETI specialization for < %s, %s, %s >\n", typeid(ExecutionSpace).name(), + typeid(XVector).name(), typeid(YVector).name()); } #endif Swap_Invoke(space, X, Y); @@ -110,13 +100,11 @@ struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_DECL(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + extern template struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; // @@ -124,13 +112,11 @@ struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_INST(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/blas/impl/KokkosBlas1_team_abs_spec.hpp b/blas/impl/KokkosBlas1_team_abs_spec.hpp index bcd9545738..a5140a9b34 100644 --- a/blas/impl/KokkosBlas1_team_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_team_abs_spec.hpp @@ -32,24 +32,20 @@ struct team_abs_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamAbs { typedef Kokkos::ArithTraits ATS; - static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, - const XV& X); + static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X); }; template struct TeamAbs { typedef Kokkos::ArithTraits ATS; - static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, - const XV& X) { + static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X) { int N = X.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), - [&](const int& i) { R(i) = ATS::abs(X(i)); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { R(i) = ATS::abs(X(i)); }); } }; diff --git a/blas/impl/KokkosBlas1_team_axpby_spec.hpp b/blas/impl/KokkosBlas1_team_axpby_spec.hpp index 356be339c3..4cd42ae37d 100644 --- a/blas/impl/KokkosBlas1_team_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_team_axpby_spec.hpp @@ -33,24 +33,20 @@ struct team_axpby_tpl_spec_avail { // Unification and Specialization layer template ::value> + bool tpl_spec_avail = team_axpby_tpl_spec_avail::value> struct TeamAXPBY { - static KOKKOS_INLINE_FUNCTION void team_axpby( - const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const typename YVector::non_const_value_type& b, - const YVector& y); + static KOKKOS_INLINE_FUNCTION void team_axpby(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const typename YVector::non_const_value_type& b, + const YVector& y); }; template struct TeamAXPBY { - static KOKKOS_INLINE_FUNCTION void team_axpby( - const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const typename YVector::non_const_value_type& b, - const YVector& y) { + static KOKKOS_INLINE_FUNCTION void team_axpby(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const typename YVector::non_const_value_type& b, + const YVector& y) { const int N = x.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), - [&](const int& i) { y(i) = b * y(i) + a * x(i); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { y(i) = b * y(i) + a * x(i); }); } }; diff --git a/blas/impl/KokkosBlas1_team_dot_spec.hpp b/blas/impl/KokkosBlas1_team_dot_spec.hpp index 041920d109..5c5e4ea85d 100644 --- a/blas/impl/KokkosBlas1_team_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_team_dot_spec.hpp @@ -32,27 +32,20 @@ struct team_dot_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamDot { - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef typename IPT::dot_type dot_type; - static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, - const XV& X, const YV& Y); + static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, const XV& X, const YV& Y); }; template struct TeamDot { - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef typename IPT::dot_type dot_type; - static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, - const XV& X, const YV& Y) { + static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, const XV& X, const YV& Y) { dot_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( diff --git a/blas/impl/KokkosBlas1_team_mult_spec.hpp b/blas/impl/KokkosBlas1_team_mult_spec.hpp index 381802eeb0..6138257582 100644 --- a/blas/impl/KokkosBlas1_team_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_team_mult_spec.hpp @@ -33,25 +33,23 @@ struct team_mult_tpl_spec_avail { // Unification and Specialization layer template ::value> + bool tpl_spec_avail = team_mult_tpl_spec_avail::value> struct TeamMult { - static KOKKOS_INLINE_FUNCTION void team_mult( - const TeamType& team, const typename YVector::non_const_value_type& gamma, - const YVector& y, const typename AVector::non_const_value_type& alpha, - const AVector& a, const XVector& x); + static KOKKOS_INLINE_FUNCTION void team_mult(const TeamType& team, + const typename YVector::non_const_value_type& gamma, const YVector& y, + const typename AVector::non_const_value_type& alpha, const AVector& a, + const XVector& x); }; template struct TeamMult { - static KOKKOS_INLINE_FUNCTION void team_mult( - const TeamType& team, const typename YVector::non_const_value_type& gamma, - const YVector& y, const typename AVector::non_const_value_type& alpha, - const AVector& a, const XVector& x) { + static KOKKOS_INLINE_FUNCTION void team_mult(const TeamType& team, + const typename YVector::non_const_value_type& gamma, const YVector& y, + const typename AVector::non_const_value_type& alpha, const AVector& a, + const XVector& x) { const int N = x.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - y(i) = gamma * y(i) + alpha * a(i) * x(i); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), + [&](const int& i) { y(i) = gamma * y(i) + alpha * a(i) * x(i); }); } }; diff --git a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp index ef050cb73b..bf486d88e8 100644 --- a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp @@ -32,31 +32,22 @@ struct team_nrm2_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamNrm2 { - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::mag_type mag_type; - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef Kokkos::ArithTraits AT; - static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, - const XV& X); + static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, const XV& X); }; template struct TeamNrm2 { - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type>::mag_type mag_type; - typedef Kokkos::Details::InnerProductSpaceTraits< - typename XV::non_const_value_type> - IPT; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; + typedef Kokkos::Details::InnerProductSpaceTraits IPT; typedef Kokkos::ArithTraits AT; - static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, - const XV& X) { + static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, const XV& X) { mag_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( diff --git a/blas/impl/KokkosBlas1_team_scal_impl.hpp b/blas/impl/KokkosBlas1_team_scal_impl.hpp index dc3aa4d42e..2ce2eece5e 100644 --- a/blas/impl/KokkosBlas1_team_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_team_scal_impl.hpp @@ -28,32 +28,22 @@ namespace Impl { /// ==================== struct TeamScaleInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), - [&](const int &i) { A[i * as0] *= alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { A[i * as0] *= alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (m > n) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - SerialScaleInternal::invoke(n, alpha, A + i * as0, as1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), + [&](const int &i) { SerialScaleInternal::invoke(n, alpha, A + i * as0, as1); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), [&](const int &j) { - SerialScaleInternal::invoke(m, alpha, A + j * as1, as0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &j) { SerialScaleInternal::invoke(m, alpha, A + j * as1, as0); }); } // member.team_barrier(); return 0; @@ -65,36 +55,25 @@ struct TeamScaleInternal { /// ======================== struct TeamVectorScaleInternal { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), - [&](const int &i) { A[i * as0] *= alpha; }); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { A[i * as0] *= alpha; }); // member.team_barrier(); return 0; } template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { if (as0 > as1) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); + }); } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, m), [&](const int &i) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, n), - [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) { A[i * as0 + j * as1] *= alpha; }); + }); } // member.team_barrier(); return 0; diff --git a/blas/impl/KokkosBlas1_team_scal_spec.hpp b/blas/impl/KokkosBlas1_team_scal_spec.hpp index ac6d36306a..3782fb4081 100644 --- a/blas/impl/KokkosBlas1_team_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_team_scal_spec.hpp @@ -32,22 +32,18 @@ struct team_scal_tpl_spec_avail { }; // Unification and Specialization layer -template ::value> +template ::value> struct TeamScal { - static KOKKOS_INLINE_FUNCTION void team_scal( - const TeamType& team, const RV& R, - const typename XV::non_const_value_type& a, const XV& X); + static KOKKOS_INLINE_FUNCTION void team_scal(const TeamType& team, const RV& R, + const typename XV::non_const_value_type& a, const XV& X); }; template struct TeamScal { - static KOKKOS_INLINE_FUNCTION void team_scal( - const TeamType& team, const RV& R, - const typename XV::non_const_value_type& a, const XV& X) { + static KOKKOS_INLINE_FUNCTION void team_scal(const TeamType& team, const RV& R, + const typename XV::non_const_value_type& a, const XV& X) { const int N = X.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), - [&](const int& i) { R(i) = a * X(i); }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { R(i) = a * X(i); }); } }; diff --git a/blas/impl/KokkosBlas1_team_update_spec.hpp b/blas/impl/KokkosBlas1_team_update_spec.hpp index 94a9221f4e..2fbf071d98 100644 --- a/blas/impl/KokkosBlas1_team_update_spec.hpp +++ b/blas/impl/KokkosBlas1_team_update_spec.hpp @@ -33,27 +33,24 @@ struct team_update_tpl_spec_avail { // Unification and Specialization layer template ::value> + bool tpl_spec_avail = team_update_tpl_spec_avail::value> struct TeamUpdate { - static KOKKOS_INLINE_FUNCTION void team_update( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const XVector& x, const typename YVector::non_const_value_type& beta, - const YVector& y, const typename ZVector::non_const_value_type& gamma, - const ZVector& z); + static KOKKOS_INLINE_FUNCTION void team_update(const TeamType& team, + const typename XVector::non_const_value_type& alpha, const XVector& x, + const typename YVector::non_const_value_type& beta, const YVector& y, + const typename ZVector::non_const_value_type& gamma, const ZVector& z); }; template struct TeamUpdate { - static KOKKOS_INLINE_FUNCTION void team_update( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const XVector& x, const typename YVector::non_const_value_type& beta, - const YVector& y, const typename ZVector::non_const_value_type& gamma, - const ZVector& z) { + static KOKKOS_INLINE_FUNCTION void team_update(const TeamType& team, + const typename XVector::non_const_value_type& alpha, const XVector& x, + const typename YVector::non_const_value_type& beta, const YVector& y, + const typename ZVector::non_const_value_type& gamma, + const ZVector& z) { const int N = x.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - z(i) = gamma * z(i) + alpha * x(i) + beta * y(i); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), + [&](const int& i) { z(i) = gamma * z(i) + alpha * x(i) + beta * y(i); }); } }; diff --git a/blas/impl/KokkosBlas1_update_impl.hpp b/blas/impl/KokkosBlas1_update_impl.hpp index 96aca5c70e..31502bee8b 100644 --- a/blas/impl/KokkosBlas1_update_impl.hpp +++ b/blas/impl/KokkosBlas1_update_impl.hpp @@ -40,8 +40,8 @@ namespace Impl { // corresponding input coefficient. Any literal coefficient of zero // has BLAS semantics of ignoring the corresponding (multi)vector // entry. -template +template struct MV_Update_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -54,19 +54,10 @@ struct MV_Update_Functor { const typename ZMV::non_const_value_type gamma_; ZMV Z_; - MV_Update_Functor(const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z) - : numCols(X.extent(1)), - alpha_(alpha), - X_(X), - beta_(beta), - Y_(Y), - gamma_(gamma), - Z_(Z) { + MV_Update_Functor(const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z) + : numCols(X.extent(1)), alpha_(alpha), X_(X), beta_(beta), Y_(Y), gamma_(gamma), Z_(Z) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Functor: X is not a Kokkos::View."); @@ -76,17 +67,15 @@ struct MV_Update_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Functor: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::MV_Update_Functor: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting enum values to int avoids compiler warnings about // comparing different kinds of enum values. - static_assert( - (int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, - "KokkosBlas::Impl::MV_Update_Functor: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, + "KokkosBlas::Impl::MV_Update_Functor: " + "X, Y, and Z must have the same rank."); static_assert(ZMV::rank == 2, "KokkosBlas::Impl::MV_Update_Functor: " "XMV, YMV, and ZMV must have rank 2."); @@ -209,8 +198,8 @@ struct MV_Update_Functor { // coefficients. The value 2 tells the functor to use the // corresponding input coefficient. Any literal coefficient of zero // has BLAS semantics of ignoring the corresponding vector entry. -template +template struct V_Update_Functor { typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -226,13 +215,7 @@ struct V_Update_Functor { V_Update_Functor(const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, const typename ZV::non_const_value_type& gamma, const ZV& Z) - : numCols(X.extent(1)), - alpha_(alpha), - X_(X), - beta_(beta), - Y_(Y), - gamma_(gamma), - Z_(Z) { + : numCols(X.extent(1)), alpha_(alpha), X_(X), beta_(beta), Y_(Y), gamma_(gamma), Z_(Z) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Functor: X is not a Kokkos::View."); @@ -242,17 +225,15 @@ struct V_Update_Functor { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Functor: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Update_Functor: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing // different kinds of enum values. - static_assert( - (int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, - "KokkosBlas::Impl::V_Update_Functor: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, + "KokkosBlas::Impl::V_Update_Functor: " + "X, Y, and Z must have the same rank."); static_assert(ZV::rank == 1, "KokkosBlas::Impl::V_Update_Functor: " "XV, YV, and ZV must have rank 1."); @@ -314,15 +295,10 @@ struct V_Update_Functor { // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding multivector entry. -template -void MV_Update_Generic(const execution_space& space, - const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z, int a = 2, int b = 2, int c = 2) { +template +void MV_Update_Generic(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z, int a = 2, int b = 2, int c = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Generic: X is not a Kokkos::View."); @@ -332,17 +308,15 @@ void MV_Update_Generic(const execution_space& space, static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Update_Generic: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::MV_Update_Generic: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing different // kinds of enum values. - static_assert( - (int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, - "KokkosBlas::Impl::MV_Update_Generic: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, + "KokkosBlas::Impl::MV_Update_Generic: " + "X, Y, and Z must have the same rank."); static_assert(ZMV::rank == 2, "KokkosBlas::Impl::MV_Update_Generic: " "XMV, YMV, and ZMV must have rank 2."); @@ -353,22 +327,18 @@ void MV_Update_Generic(const execution_space& space, if (a == 0) { if (b == 0) { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } else { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } @@ -379,22 +349,18 @@ void MV_Update_Generic(const execution_space& space, else { if (b == 0) { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } else { if (c == 0) { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - MV_Update_Functor op(alpha, X, beta, - Y, gamma, Z); + MV_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } @@ -417,13 +383,9 @@ void MV_Update_Generic(const execution_space& space, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding vector entry. template -void V_Update_Generic(const execution_space& space, - const typename XV::non_const_value_type& alpha, - const XV& X, - const typename YV::non_const_value_type& beta, - const YV& Y, - const typename ZV::non_const_value_type& gamma, - const ZV& Z, int a = 2, int b = 2, int c = 2) { +void V_Update_Generic(const execution_space& space, const typename XV::non_const_value_type& alpha, const XV& X, + const typename YV::non_const_value_type& beta, const YV& Y, + const typename ZV::non_const_value_type& gamma, const ZV& Z, int a = 2, int b = 2, int c = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Generic: X is not a Kokkos::View."); @@ -433,17 +395,15 @@ void V_Update_Generic(const execution_space& space, static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Update_Generic: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::V_Update_Generic: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing // different kinds of enum values. - static_assert( - (int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, - "KokkosBlas::Impl::V_Update_Generic: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, + "KokkosBlas::Impl::V_Update_Generic: " + "X, Y, and Z must have the same rank."); static_assert(ZV::rank == 1, "KokkosBlas::Impl::V_Update_Generic: " "XV, YV, and ZV must have rank 1."); @@ -454,22 +414,18 @@ void V_Update_Generic(const execution_space& space, if (a == 0) { if (b == 0) { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,0,0>", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,0,c>", policy, op); } } else { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,b,0>", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update<0,b,c>", policy, op); } } @@ -480,22 +436,18 @@ void V_Update_Generic(const execution_space& space, else { if (b == 0) { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } else { if (c == 0) { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } else { - V_Update_Functor op(alpha, X, beta, Y, - gamma, Z); + V_Update_Functor op(alpha, X, beta, Y, gamma, Z); Kokkos::parallel_for("KokkosBlas::update", policy, op); } } diff --git a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index 9a54888012..b031a529b8 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ b/blas/impl/KokkosBlas1_update_spec.hpp @@ -27,8 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_eti_spec_avail { enum : bool { value = false }; }; @@ -42,21 +41,17 @@ struct update_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_UPDATE_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct update_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_UPDATE_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct update_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -66,21 +61,17 @@ struct update_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct update_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 2> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct update_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 2> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -103,39 +94,27 @@ namespace Impl { /// Z(i,j) = alpha*X(i,j) + beta*Y(i,j) + gamma*Z(i,j), /// /// with special cases for alpha, beta, or gamma = 0. -template ::value, - bool eti_spec_avail = - update_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = update_eti_spec_avail::value> struct Update { - static void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z); + static void update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for XMV, YMV, and ZMV rank-2 Views. template -struct Update { +struct Update { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; - static void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, - const XMV& X, - const typename YMV::non_const_value_type& beta, - const YMV& Y, - const typename ZMV::non_const_value_type& gamma, - const ZMV& Z) { + static void update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Update::update: X is not a Kokkos::View."); @@ -145,32 +124,28 @@ struct Update::value, "KokkosBlas::Impl::" "Update::update: Z is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Update::update: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); // Casting to int avoids compiler warnings about comparing // different kinds of enum values. - static_assert( - (int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, - "KokkosBlas::Impl::Update::update: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZMV::rank == (int)XMV::rank && (int)ZMV::rank == (int)YMV::rank, + "KokkosBlas::Impl::Update::update: " + "X, Y, and Z must have the same rank."); static_assert(ZMV::rank == 2, "KokkosBlas::Impl::Update::update: " "XMV, YMV, and ZMV must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::update[ETI]" - : "KokkosBlas::update[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::update[ETI]" + : "KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", - typeid(XMV).name(), typeid(YMV).name(), typeid(ZMV).name()); + printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", typeid(XMV).name(), typeid(YMV).name(), + typeid(ZMV).name()); else { - printf( - "KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", - typeid(XMV).name(), typeid(YMV).name(), typeid(ZMV).name()); + printf("KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", typeid(XMV).name(), + typeid(YMV).name(), typeid(ZMV).name()); } #endif @@ -203,24 +178,20 @@ struct Update(INT_MAX)) { typedef int index_type; - V_Update_Generic(space, alpha, X_0, beta, - Y_0, gamma, Z_0, a, b, c); + V_Update_Generic( + space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, c); } else { typedef typename XMV::size_type index_type; - V_Update_Generic(space, alpha, X_0, beta, - Y_0, gamma, Z_0, a, b, c); + V_Update_Generic( + space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, c); } } else { if (numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + MV_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XMV::size_type index_type; - MV_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + MV_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } } Kokkos::Profiling::popRegion(); @@ -229,19 +200,15 @@ struct Update -struct Update { +struct Update { typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; - static void update(const execution_space& space, - const typename XV::non_const_value_type& alpha, - const XV& X, const typename YV::non_const_value_type& beta, - const YV& Y, - const typename ZV::non_const_value_type& gamma, - const ZV& Z) { + static void update(const execution_space& space, const typename XV::non_const_value_type& alpha, const XV& X, + const typename YV::non_const_value_type& beta, const YV& Y, + const typename ZV::non_const_value_type& gamma, const ZV& Z) { // XV, YV, and ZV must be Kokkos::View specializations. static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -253,29 +220,25 @@ struct Update::update: Z is not a Kokkos::View."); // ZV must be nonconst (else it can't be an output argument). - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::Impl::Update::update: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert( - (int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, - "KokkosBlas::Impl::Update::update: " - "X, Y, and Z must have the same rank."); + static_assert((int)ZV::rank == (int)XV::rank && (int)ZV::rank == (int)YV::rank, + "KokkosBlas::Impl::Update::update: " + "X, Y, and Z must have the same rank."); static_assert(ZV::rank == 1, "KokkosBlas::Impl::Update::update: " "XV, YV, and ZV must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::update[ETI]" - : "KokkosBlas::update[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::update[ETI]" + : "KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if (KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) - printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", - typeid(XV).name(), typeid(YV).name(), typeid(ZV).name()); + printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n", typeid(XV).name(), typeid(YV).name(), + typeid(ZV).name()); else { - printf( - "KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", - typeid(XV).name(), typeid(YV).name(), typeid(ZV).name()); + printf("KokkosBlas1::update<> non-ETI specialization for < %s , %s , %s >\n", typeid(XV).name(), + typeid(YV).name(), typeid(ZV).name()); } #endif @@ -299,15 +262,12 @@ struct Update(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { + if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - V_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XV::size_type index_type; - V_Update_Generic( - space, alpha, X, beta, Y, gamma, Z, a, b, c); + V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } Kokkos::Profiling::popRegion(); } @@ -326,32 +286,24 @@ struct Update, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; -#define KOKKOSBLAS1_UPDATE_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Update< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_UPDATE_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 1, false, true>; // @@ -362,32 +314,24 @@ struct Update, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; -#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template struct Update< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct Update< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ 2, false, true>; #include diff --git a/blas/impl/KokkosBlas2_gemv_impl.hpp b/blas/impl/KokkosBlas2_gemv_impl.hpp index dc0f531583..b1976e2622 100644 --- a/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -29,35 +29,26 @@ namespace Impl { template struct SingleLevelNontransposeGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using y_value_type = typename YViewType::non_const_value_type; - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; - - SingleLevelNontransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const BetaCoeffType& beta, - const YViewType& y) + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; + + SingleLevelNontransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, + const BetaCoeffType& beta, const YViewType& y) : alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); static_assert(alphaPreset == 0 || alphaPreset == 1 || alphaPreset == -1, "Invalid alphaPreset value; valid values are 0, 1, and -1."); static_assert(betaPreset == 0 || betaPreset == 1 || betaPreset == -1, @@ -112,43 +103,29 @@ struct SingleLevelNontransposeGEMV { template struct SingleLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; typedef AccumScalar value_type[]; IndexType value_count; // Kokkos needs this for reductions w/ array results - SingleLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const BetaCoeffType& beta, - const YViewType& y) - : value_count(A.extent(1)), - alpha_(alpha), - A_(A), - x_(x), - beta_(beta), - y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer."); + SingleLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, + const BetaCoeffType& beta, const YViewType& y) + : value_count(A.extent(1)), alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); static_assert(alphaPreset == 0 || alphaPreset == 1 || alphaPreset == -1, "Invalid alphaPreset value; valid values are 0, 1, and -1."); static_assert(betaPreset == 0 || betaPreset == 1 || betaPreset == -1, @@ -178,8 +155,7 @@ struct SingleLevelTransposeGEMV { } } - KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i, - value_type y_cur) const { + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i, value_type y_cur) const { using Kokkos::ArithTraits; using KAT = ArithTraits; @@ -199,27 +175,18 @@ struct SingleLevelTransposeGEMV { }; // Single-level parallel version of GEMV. -template -void singleLevelGemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template +void singleLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer"); using y_value_type = typename YViewType::non_const_value_type; using policy_type = Kokkos::RangePolicy; @@ -242,12 +209,9 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", - policy_type(0, A.extent(1)), functor); + Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", policy_type(0, A.extent(1)), functor); } return; } @@ -260,49 +224,35 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } } else if (alpha == Kokkos::ArithTraits::one()) { if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } } else { // alpha != 0 and alpha != 1 if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } @@ -315,58 +265,37 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else if (alpha == Kokkos::ArithTraits::one()) { if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else { // alpha != 0 and alpha != 1 if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } } else if (tr == 'C' || tr == 'c' || tr == 'H' || tr == 'h') { // conj xpose @@ -377,58 +306,37 @@ void singleLevelGemv(const ExecutionSpace& space, const char trans[], } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else if (alpha == Kokkos::ArithTraits::one()) { if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } else { // alpha != 0 and alpha != 1 if (beta == Kokkos::ArithTraits::zero()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else if (beta == Kokkos::ArithTraits::one()) { - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } else { // beta != 0 && beta != 1 - using functor_type = - SingleLevelTransposeGEMV; + using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, - functor); + Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } } } @@ -440,38 +348,29 @@ struct TwoLevelGEMV_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- // Functor for a two-level parallel_reduce version of GEMV (non-transpose), // designed for performance on GPU. Kernel depends on the layout of A. -template +template struct TwoLevelGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const BetaCoeffType& beta, + TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, const YViewType& y) : alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); } public: @@ -480,15 +379,12 @@ struct TwoLevelGEMV { // -Groups of 32 threads handle N/teamsize columns sequentially, placing // results into shared. -Then individual thread results are combined with // parallel_reduce. - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutLeftTag, const member_type& team) const { using KAT = Kokkos::ArithTraits; using AKAT = Kokkos::ArithTraits; // Allocate a Scalar in shared for each thread - AccumScalar* blockResult = - (AccumScalar*)team.team_shmem().get_shmem(32 * sizeof(AccumScalar)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), - [&](int i) { blockResult[i] = AKAT::zero(); }); + AccumScalar* blockResult = (AccumScalar*)team.team_shmem().get_shmem(32 * sizeof(AccumScalar)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), [&](int i) { blockResult[i] = AKAT::zero(); }); team.team_barrier(); // Which block this thread will work on int block = team.team_rank() / 32; @@ -498,9 +394,7 @@ struct TwoLevelGEMV { AccumScalar localSum = AKAT::zero(); // compute local sum if (row < (IndexType)A_.extent(0)) { - for (IndexType col = blockColStart; - col < blockColStart + columnsPerThread && col < A_.extent(1); - col++) { + for (IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < A_.extent(1); col++) { // A access is coalesced, x access is a broadcast localSum += AccumScalar(A_(row, col)) * AccumScalar(x_(col)); } @@ -514,15 +408,13 @@ struct TwoLevelGEMV { if (beta_ == KAT::zero()) y_(yrow) = y_value_type(alpha_ * blockResult[i]); else - y_(yrow) = y_value_type(beta_ * AccumScalar(y_(yrow)) + - alpha_ * blockResult[i]); + y_(yrow) = y_value_type(beta_ * AccumScalar(y_(yrow)) + alpha_ * blockResult[i]); } }); } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGEMV_LayoutRightTag, const member_type& team) const { using KAT = Kokkos::ArithTraits; const IndexType N = A_.extent(1); @@ -532,10 +424,7 @@ struct TwoLevelGEMV { AccumScalar val; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, N), - [&](const int j, AccumScalar& update) { - update += AccumScalar(A_(i, j)) * x_(j); - }, - val); + [&](const int j, AccumScalar& update) { update += AccumScalar(A_(i, j)) * x_(j); }, val); // compute yj = beta*yj + alpha*val Kokkos::single(Kokkos::PerTeam(team), [&]() { @@ -561,39 +450,29 @@ struct TwoLevelGEMV { // transpose GEMV. The functor uses parallel-for over the columns of the input // matrix A and each team uses parallel-reduce over the row of its column. // The output vector y is the reduction result. -template struct TwoLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, - const XViewType& x, const BetaCoeffType& beta, + TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, const YViewType& y) : alpha_(alpha), A_(A), x_(x), beta_(beta), y_(y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer."); } public: @@ -634,27 +513,18 @@ struct TwoLevelTransposeGEMV { }; // Two-level parallel version of GEMV. -template -void twoLevelGemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template +void twoLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(std::is_integral::value, "IndexType must be an integer"); using y_value_type = typename YViewType::non_const_value_type; using team_policy_type = Kokkos::TeamPolicy; @@ -681,40 +551,33 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. - using functor_type = - SingleLevelNontransposeGEMV; + using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", - range_policy_type(space, 0, y.extent(0)), functor); + Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range_policy_type(space, 0, y.extent(0)), functor); } return; } if (tr == 'N') { - constexpr bool isLayoutLeft = std::is_same::value; + constexpr bool isLayoutLeft = std::is_same::value; // Both kernels work for both layouts - the only difference is access // pattern. using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using tagged_policy = Kokkos::TeamPolicy; - using functor_type = TwoLevelGEMV; + using functor_type = TwoLevelGEMV; functor_type functor(alpha, A, x, beta, y); tagged_policy team; if constexpr (isLayoutLeft) { - using AccumScalar = typename std::conditional< - std::is_same::value || - std::is_same::value, - float, y_value_type>::type; + using AccumScalar = + typename std::conditional::value || + std::is_same::value, + float, y_value_type>::type; size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; tagged_policy temp(space, 1, 1); temp.set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); - int teamSize = - temp.team_size_recommended(functor, Kokkos::ParallelForTag()); + int teamSize = temp.team_size_recommended(functor, Kokkos::ParallelForTag()); // make sure teamSize is a multiple of 32 teamSize -= teamSize % 32; // don't make teamSize larger than what's useful @@ -728,8 +591,7 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], #endif int numBlocks = teamSize / 32; functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks; - team = tagged_policy(space, numTeams, teamSize) - .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); + team = tagged_policy(space, numTeams, teamSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); } else { // LayoutRight: one team per row team = tagged_policy(space, A.extent(0), Kokkos::AUTO); @@ -744,21 +606,15 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], } else if (tr == 'T') { // transpose, and not conj transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = - TwoLevelTransposeGEMV; + using functor_type = TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, - functor); + Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); } else if (tr == 'C' || tr == 'H') { // conjugate transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = - TwoLevelTransposeGEMV; + using functor_type = TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); - Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, - functor); + Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); } } } @@ -766,26 +622,18 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], // generalGemv: use 1 level (Range) or 2 level (Team) implementation, // depending on whether execution space is CPU or GPU. enable_if makes sure // unused kernels are not instantiated. -template ()>::type* = nullptr> -void generalGemvImpl(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template ()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { singleLevelGemv(space, trans, alpha, A, x, beta, y); } -template ()>::type* = nullptr> -void generalGemvImpl(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, +template ()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { twoLevelGemv(space, trans, alpha, A, x, beta, y); } diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index 97e6e2717e..05e2d28bc7 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -41,19 +41,16 @@ struct gemv_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS2_GEMV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct gemv_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct gemv_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -68,47 +65,32 @@ namespace Impl { // // Implementation of KokkosBlas::gemv. -template < - class ExecutionSpace, class AViewType, class XViewType, class YViewType, - bool tpl_spec_avail = gemv_tpl_spec_avail::value, - bool eti_spec_avail = gemv_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = gemv_eti_spec_avail::value> struct GEMV { - static void gemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, + static void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::gemv[ETI]" - : "KokkosBlas::gemv[noETI]"); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::gemv[ETI]" + : "KokkosBlas::gemv[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); // Prefer int as the index type, but use a larger type if needed. - if (numRows < static_cast(INT_MAX) && - numCols < static_cast(INT_MAX)) { - generalGemvImpl( - space, trans, alpha, A, x, beta, y); + if (numRows < static_cast(INT_MAX) && numCols < static_cast(INT_MAX)) { + generalGemvImpl(space, trans, alpha, A, x, beta, y); } else { - generalGemvImpl( - space, trans, alpha, A, x, beta, y); + generalGemvImpl(space, trans, alpha, A, x, beta, y); } Kokkos::Profiling::popRegion(); } @@ -129,30 +111,24 @@ struct GEMV { // one or more .cpp files. // -#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct GEMV< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct GEMV< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct GEMV< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct GEMV< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index 651db7f11a..94eb1868f9 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -34,8 +34,8 @@ struct ThreadParallelGER { using YComponentType = typename YViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - ThreadParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + ThreadParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } @@ -53,9 +53,7 @@ struct ThreadParallelGER { } } else { for (IndexType j = 0; j < N; ++j) { - A_(i, j) += - AComponentType(alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); } } } @@ -70,14 +68,12 @@ struct ThreadParallelGER { }; // Thread parallel version of GER. -template +template void threadParallelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, + const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -88,12 +84,10 @@ void threadParallelGer(const ExecutionSpace& space, const char trans[], } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - Kokkos::RangePolicy rangePolicy(space, 0, - A.extent(0)); - ThreadParallelGER functor( - (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, - functor); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + ThreadParallelGER functor((trans[0] == 'T') || (trans[0] == 't'), alpha, + x, y, A); + Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, functor); } } @@ -104,8 +98,7 @@ struct TeamParallelGER_LayoutRightTag {}; // Functor for the team parallel version of GER, designed for // performance on GPU. The kernel depends on the layout of A. -template +template struct TeamParallelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -115,16 +108,15 @@ struct TeamParallelGER { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TeamParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + TeamParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutLeftTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -132,24 +124,18 @@ struct TeamParallelGER { const IndexType j(team.league_rank()); if (justTranspose_) { const YComponentType y_fixed(y_(j)); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), + [&](const IndexType& i) { A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); }); } else { - const YComponentType y_fixed( - Kokkos::ArithTraits::conj(y_(j))); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - }); + const YComponentType y_fixed(Kokkos::ArithTraits::conj(y_(j))); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), + [&](const IndexType& i) { A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); }); } } } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutRightTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -157,17 +143,12 @@ struct TeamParallelGER { const IndexType i(team.league_rank()); const XComponentType x_fixed(x_(i)); if (justTranspose_) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), + [&](const IndexType& j) { A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); + }); } } } @@ -181,14 +162,11 @@ struct TeamParallelGER { }; // Team parallel version of GER. -template -void teamParallelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void teamParallelGer(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -203,11 +181,9 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], return; } - constexpr bool isLayoutLeft = - std::is_same::value; + constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -218,8 +194,8 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TeamParallelGER - functor((trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); + TeamParallelGER functor( + (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); Kokkos::parallel_for("KokkosBlas::ger[teamParallel]", teamPolicy, functor); } @@ -231,25 +207,17 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], // // The 'enable_if' makes sure unused kernels are not instantiated. -template ()>::type* = nullptr> -void generalGerImpl(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { threadParallelGer(space, trans, alpha, x, y, A); } -template ()>::type* = nullptr> -void generalGerImpl(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { teamParallelGer(space, trans, alpha, x, y, A); } diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index 9802194b98..04e25ab422 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -40,19 +40,16 @@ struct ger_eti_spec_avail { // specializations go in this header file. We may spread out definitions (see // _INST macro below) across one or more .cpp files. // -#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct ger_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -67,34 +64,26 @@ namespace Impl { // // Implementation of KokkosBlas::ger. -template ::value, - bool eti_spec_avail = ger_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = ger_eti_spec_avail::value> struct GER { - static void ger(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, + static void ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::ger[ETI]" - : "KokkosBlas::ger[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::ger[ETI]" + : "KokkosBlas::ger[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); // Prefer int as the index type, but use a larger type if needed. - if ((numRows < static_cast(INT_MAX)) && - (numCols < static_cast(INT_MAX))) { - generalGerImpl( - space, trans, alpha, x, y, A); + if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { + generalGerImpl(space, trans, alpha, x, y, A); } else { - generalGerImpl( - space, trans, alpha, x, y, A); + generalGerImpl(space, trans, alpha, x, y, A); } Kokkos::Profiling::popRegion(); @@ -115,30 +104,24 @@ struct GER { // We may spread out definitions (see _DEF macro below) across one or more .cpp // files. // -#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_serial_gemv_impl.hpp b/blas/impl/KokkosBlas2_serial_gemv_impl.hpp index 1fec8769cb..79f49fdd0e 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_impl.hpp @@ -25,13 +25,9 @@ namespace KokkosBlas { template struct SerialGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, - const AViewType & /*A*/, - const xViewType & /*x*/, - const ScalarType /*beta*/, - const yViewType & /*y*/); + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType /*alpha*/, const AViewType & /*A*/, const xViewType & /*x*/, + const ScalarType /*beta*/, const yViewType & /*y*/); }; } // namespace KokkosBlas @@ -49,27 +45,21 @@ namespace KokkosBlas { /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); } /// @@ -77,27 +67,21 @@ SerialGemv::invoke( /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), - x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); } /// @@ -105,27 +89,21 @@ SerialGemv::invoke( /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - Impl::OpConj(), A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(Impl::OpConj(), A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { - return Impl::SerialGemvInternal::invoke( - Impl::OpConj(), A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { + return Impl::SerialGemvInternal::invoke(Impl::OpConj(), A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } } // namespace KokkosBlas diff --git a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp index aa7efc9122..1b70413119 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp @@ -41,24 +41,17 @@ struct InnerMultipleDotProduct { const int _as0, _as1, _xs0, _ys0; KOKKOS_INLINE_FUNCTION - InnerMultipleDotProduct(const int as0, const int as1, const int xs0, - const int ys0) + InnerMultipleDotProduct(const int as0, const int as1, const int xs0, const int ys0) : _as0(as0), _as1(as1), _xs0(xs0), _ys0(ys0) {} - template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, - const int n, + template + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, ValueYType *KOKKOS_RESTRICT y); - template - KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, - const int m, const int n, + template + KOKKOS_INLINE_FUNCTION int serial_invoke(const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, ValueYType *KOKKOS_RESTRICT y); }; @@ -67,16 +60,14 @@ struct InnerMultipleDotProduct { /// ==================== template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; - const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, - i4 = 4 * _as0; + const int i0 = 0 * _as0, i1 = 1 * _as0, i2 = 2 * _as0, i3 = 3 * _as0, i4 = 4 * _as0; // unroll by rows ValueYType y_0 = 0, y_1 = 0, y_2 = 0, y_3 = 0, y_4 = 0; @@ -105,12 +96,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (!n) return 0; OpA op; @@ -141,12 +131,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; OpA op; @@ -175,12 +164,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; OpA op; @@ -207,12 +195,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int n, + ValueYType *KOKKOS_RESTRICT y) { if (n <= 0) return 0; OpA op; @@ -230,12 +217,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 5: { @@ -268,12 +254,11 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<5>::serial_invoke( } template <> -template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +template +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 4: { @@ -301,13 +286,12 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<4>::serial_invoke( } template <> -template +template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 3: { @@ -330,13 +314,12 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<3>::serial_invoke( } template <> -template +template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 2: { @@ -354,13 +337,12 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<2>::serial_invoke( } template <> -template +template -KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke( - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const ValueXType *KOKKOS_RESTRICT x, const int m, const int n, - ValueYType *KOKKOS_RESTRICT y) { +KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke(const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, + const ValueXType *KOKKOS_RESTRICT x, const int m, + const int n, ValueYType *KOKKOS_RESTRICT y) { if (m <= 0 || n <= 0) return 0; switch (m) { case 1: { diff --git a/blas/impl/KokkosBlas2_serial_gemv_internal.hpp b/blas/impl/KokkosBlas2_serial_gemv_internal.hpp index 2d78102c7a..912972c7ee 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_internal.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_internal.hpp @@ -31,33 +31,27 @@ namespace Impl { template struct SerialGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - OpA op, const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(OpA op, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); // default OpA = OpID - template - KOKKOS_INLINE_FUNCTION static int invoke( - const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { return invoke(OpID(), m, n, alpha, A, as0, as1, x, xs0, beta, y, ys0); } }; template <> -template +template KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( - OpA op, const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + OpA op, const int m, const int n, const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -91,12 +85,10 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( } template <> -template +template KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( - OpA /* op */, const int m, const int n, const ScalarType alpha, - const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + OpA /* op */, const int m, const int n, const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -116,8 +108,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( Impl::InnerMultipleDotProduct inner(as0, as1, xs0, ys0); const int mb = mbAlgo; for (int i = 0; i < m; i += mb) - inner.serial_invoke(alpha, A + i * as0, x, - (i + mb) > m ? (m - i) : mb, n, y + i * ys0); + inner.serial_invoke(alpha, A + i * as0, x, (i + mb) > m ? (m - i) : mb, n, y + i * ys0); } return 0; } diff --git a/blas/impl/KokkosBlas2_syr2_impl.hpp b/blas/impl/KokkosBlas2_syr2_impl.hpp index 69284e9547..7bcb0069ab 100644 --- a/blas/impl/KokkosBlas2_syr2_impl.hpp +++ b/blas/impl/KokkosBlas2_syr2_impl.hpp @@ -27,16 +27,14 @@ namespace Impl { // Functor for the thread parallel version of SYR2. // This functor parallelizes over rows of the input matrix A. -template +template struct ThreadParallelSYR2 { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using YComponentType = typename YViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - ThreadParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) + ThreadParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, const AViewType& A) : alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } @@ -55,16 +53,14 @@ struct ThreadParallelSYR2 { if constexpr (tJustTranspose) { if (x_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); } } } if (y_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); } } @@ -72,21 +68,16 @@ struct ThreadParallelSYR2 { } else { if (x_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); } } } if (y_fixed != Kokkos::ArithTraits::zero()) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - Kokkos::ArithTraits::conj(alpha_) * y_fixed * - Kokkos::ArithTraits::conj(x_(j))); + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(Kokkos::ArithTraits::conj(alpha_) * y_fixed * + Kokkos::ArithTraits::conj(x_(j))); } } } @@ -102,14 +93,11 @@ struct ThreadParallelSYR2 { }; // Thread parallel version of SYR2. -template -void threadParallelSyr2(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void threadParallelSyr2(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -120,13 +108,9 @@ void threadParallelSyr2(const ExecutionSpace& space, } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - Kokkos::RangePolicy rangePolicy(space, 0, - A.extent(0)); - ThreadParallelSYR2 - functor(alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::syr2[threadParallel]", rangePolicy, - functor); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + ThreadParallelSYR2 functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[threadParallel]", rangePolicy, functor); } } @@ -137,8 +121,8 @@ struct TeamParallelSYR2_LayoutRightTag {}; // Functor for the team parallel version of SYR2, designed for // performance on GPUs. The kernel depends on the layout of A. -template +template struct TeamParallelSYR2 { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -148,16 +132,14 @@ struct TeamParallelSYR2 { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TeamParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) + TeamParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, const YViewType& y, const AViewType& A) : alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutLeftTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -171,47 +153,35 @@ struct TeamParallelSYR2 { const XComponentType x_fixed(x_(j)); const YComponentType y_fixed(y_(j)); if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); } if (x_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * y_(i) * x_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_(i) * x_fixed); + } + }); } } else { - const XComponentType x_fixed( - Kokkos::ArithTraits::conj(x_(j))); - const YComponentType y_fixed( - Kokkos::ArithTraits::conj(y_(j))); + const XComponentType x_fixed(Kokkos::ArithTraits::conj(x_(j))); + const YComponentType y_fixed(Kokkos::ArithTraits::conj(y_(j))); if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); } if (x_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - Kokkos::ArithTraits::conj(alpha_) * - y_(i) * x_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(Kokkos::ArithTraits::conj(alpha_) * y_(i) * x_fixed); + } + }); } } } @@ -219,8 +189,7 @@ struct TeamParallelSYR2 { } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutRightTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { @@ -234,46 +203,34 @@ struct TeamParallelSYR2 { const YComponentType y_fixed(y_(i)); if constexpr (tJustTranspose) { if (x_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + }); } if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + }); } } else { if (x_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(y_(j))); + } + }); } if (y_fixed != Kokkos::ArithTraits::zero()) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - Kokkos::ArithTraits::conj(alpha_) * - y_fixed * - Kokkos::ArithTraits::conj(x_(j))); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(Kokkos::ArithTraits::conj(alpha_) * y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); } } } @@ -288,14 +245,11 @@ struct TeamParallelSYR2 { }; // Team parallel version of SYR2. -template -void teamParallelSyr2(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void teamParallelSyr2(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -310,11 +264,9 @@ void teamParallelSyr2(const ExecutionSpace& space, return; } - constexpr bool isLayoutLeft = - std::is_same::value; + constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -325,9 +277,8 @@ void teamParallelSyr2(const ExecutionSpace& space, teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TeamParallelSYR2 - functor(alpha, x, y, A); + TeamParallelSYR2 functor( + alpha, x, y, A); Kokkos::parallel_for("KokkosBlas::syr2[teamParallel]", teamPolicy, functor); } @@ -339,28 +290,22 @@ void teamParallelSyr2(const ExecutionSpace& space, // // The 'enable_if' makes sure unused kernels are not instantiated. -template ()>::type* = nullptr> -void generalSyr2Impl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - threadParallelSyr2(space, alpha, x, y, A); +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + threadParallelSyr2(space, alpha, + x, y, A); } -template ()>::type* = nullptr> -void generalSyr2Impl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { - teamParallelSyr2(space, alpha, x, y, A); +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + teamParallelSyr2(space, alpha, x, + y, A); } } // namespace Impl diff --git a/blas/impl/KokkosBlas2_syr2_spec.hpp b/blas/impl/KokkosBlas2_syr2_spec.hpp index 01637ba1d4..a8ae741ede 100644 --- a/blas/impl/KokkosBlas2_syr2_spec.hpp +++ b/blas/impl/KokkosBlas2_syr2_spec.hpp @@ -40,19 +40,16 @@ struct syr2_eti_spec_avail { // specializations go in this header file. We may spread out definitions (see // _INST macro below) across one or more .cpp files. // -#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr2_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -67,22 +64,17 @@ namespace Impl { // // Implementation of KokkosBlas::syr2. -template < - class ExecutionSpace, class XViewType, class YViewType, class AViewType, - bool tpl_spec_avail = syr2_tpl_spec_avail::value, - bool eti_spec_avail = syr2_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = syr2_eti_spec_avail::value> struct SYR2 { - static void syr2(const ExecutionSpace& space, const char trans[], - const char uplo[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + static void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::syr2[ETI]" - : "KokkosBlas::syr2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::syr2[ETI]" + : "KokkosBlas::syr2[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); @@ -92,41 +84,33 @@ struct SYR2 { bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); // Prefer int as the index type, but use a larsyr2 type if needed. - if ((numRows < static_cast(INT_MAX)) && - (numCols < static_cast(INT_MAX))) { + if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { if (justTranspose) { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } } else { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } } } else { if (justTranspose) { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } } else { if (justUp) { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, A); } else { - generalSyr2Impl(space, alpha, x, y, A); + generalSyr2Impl(space, alpha, x, y, + A); } } } @@ -149,30 +133,24 @@ struct SYR2 { // We may spread out definitions (see _DEF macro below) across one or more .cpp // files. // -#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_SYR2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 685ca75997..7685fd4b4b 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -27,16 +27,13 @@ namespace Impl { // Functor for the thread parallel version of SYR. // This functor parallelizes over rows of the input matrix A. -template +template struct ThreadParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - ThreadParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, - const AViewType& A) - : alpha_(alpha), x_(x), A_(A) { + ThreadParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, const AViewType& A) : alpha_(alpha), x_(x), A_(A) { // Nothing to do } @@ -50,18 +47,14 @@ struct ThreadParallelSYR { if constexpr (tJustTranspose) { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); } } } else { for (IndexType j = 0; j < N; ++j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(x_(j))); + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(x_(j))); } } } @@ -75,13 +68,10 @@ struct ThreadParallelSYR { }; // Thread parallel version of SYR. -template -void threadParallelSyr(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, +template +void threadParallelSyr(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -90,12 +80,9 @@ void threadParallelSyr(const ExecutionSpace& space, } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - Kokkos::RangePolicy rangePolicy(space, 0, - A.extent(0)); - ThreadParallelSYR - functor(alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[threadParallel]", rangePolicy, - functor); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + ThreadParallelSYR functor(alpha, x, A); + Kokkos::parallel_for("KokkosBlas::syr[threadParallel]", rangePolicy, functor); } } @@ -106,8 +93,7 @@ struct TeamParallelSYR_LayoutRightTag {}; // Functor for the team parallel version of SYR, designed for // performance on GPUs. The kernel depends on the layout of A. -template +template struct TeamParallelSYR { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; @@ -116,16 +102,13 @@ struct TeamParallelSYR { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TeamParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, - const AViewType& A) - : alpha_(alpha), x_(x), A_(A) { + TeamParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, const AViewType& A) : alpha_(alpha), x_(x), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutLeftTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutLeftTag, const member_type& team) const { // Condition 'alpha_ == zero' has already been checked const IndexType j(team.league_rank()); if (x_(j) == Kokkos::ArithTraits::zero()) { @@ -134,30 +117,24 @@ struct TeamParallelSYR { const IndexType M(A_.extent(0)); if constexpr (tJustTranspose) { const XComponentType x_fixed(x_(j)); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); } else { - const XComponentType x_fixed( - Kokkos::ArithTraits::conj(x_(j))); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); - } - }); + const XComponentType x_fixed(Kokkos::ArithTraits::conj(x_(j))); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); } } } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutRightTag, - const member_type& team) const { + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutRightTag, const member_type& team) const { // Condition 'alpha_ == zero' has already been checked const IndexType i(team.league_rank()); if (x_(i) == Kokkos::ArithTraits::zero()) { @@ -166,23 +143,17 @@ struct TeamParallelSYR { const IndexType N(A_.extent(1)); const XComponentType x_fixed(x_(i)); if constexpr (tJustTranspose) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); + } + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { - if (((tJustUp == true) && (i <= j)) || - ((tJustUp == false) && (i >= j))) { - A_(i, j) += AComponentType( - alpha_ * x_fixed * - Kokkos::ArithTraits::conj(x_(j))); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * Kokkos::ArithTraits::conj(x_(j))); + } + }); } } } @@ -194,13 +165,10 @@ struct TeamParallelSYR { }; // Team parallel version of SYR. -template -void teamParallelSyr(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { - static_assert(std::is_integral::value, - "IndexType must be an integer"); +template +void teamParallelSyr(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + static_assert(std::is_integral::value, "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -212,11 +180,9 @@ void teamParallelSyr(const ExecutionSpace& space, return; } - constexpr bool isLayoutLeft = - std::is_same_v; + constexpr bool isLayoutLeft = std::is_same_v; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -227,9 +193,7 @@ void teamParallelSyr(const ExecutionSpace& space, teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TeamParallelSYR - functor(alpha, x, A); + TeamParallelSYR functor(alpha, x, A); Kokkos::parallel_for("KokkosBlas::syr[teamParallel]", teamPolicy, functor); } @@ -241,26 +205,18 @@ void teamParallelSyr(const ExecutionSpace& space, // // The 'enable_if' makes sure unused kernels are not instantiated. -template ()>::type* = nullptr> -void generalSyrImpl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { - threadParallelSyr(space, alpha, x, A); +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + threadParallelSyr(space, alpha, x, A); } -template ()>::type* = nullptr> -void generalSyrImpl(const ExecutionSpace& space, - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) { - teamParallelSyr(space, alpha, x, A); +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + teamParallelSyr(space, alpha, x, A); } } // namespace Impl diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index b07c3a1446..58c7753618 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -40,16 +40,14 @@ struct syr_eti_spec_avail { // specializations go in this header file. We may spread out definitions (see // _INST macro below) across one or more .cpp files. // -#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct syr_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -65,20 +63,15 @@ namespace Impl { // Implementation of KokkosBlas::syr. template ::value, - bool eti_spec_avail = - syr_eti_spec_avail::value> + bool tpl_spec_avail = syr_tpl_spec_avail::value, + bool eti_spec_avail = syr_eti_spec_avail::value> struct SYR { - static void syr(const ExecutionSpace& space, const char trans[], - const char uplo[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const AViewType& A) + static void syr(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::syr[ETI]" - : "KokkosBlas::syr[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::syr[ETI]" + : "KokkosBlas::syr[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); @@ -88,41 +81,32 @@ struct SYR { bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); // Prefer int as the index type, but use a larsyr type if needed. - if ((numRows < static_cast(INT_MAX)) && - (numCols < static_cast(INT_MAX))) { + if ((numRows < static_cast(INT_MAX)) && (numCols < static_cast(INT_MAX))) { if (justTranspose) { if (justUp) { - generalSyrImpl( - space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } else { if (justUp) { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } } else { if (justTranspose) { if (justUp) { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } else { if (justUp) { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } else { - generalSyrImpl(space, alpha, x, A); + generalSyrImpl(space, alpha, x, A); } } } @@ -145,24 +129,20 @@ struct SYR { // We may spread out definitions (see _DEF macro below) across one or more .cpp // files. // -#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/blas/impl/KokkosBlas2_team_gemv_impl.hpp b/blas/impl/KokkosBlas2_team_gemv_impl.hpp index 5e43cae7d4..19e2bde931 100644 --- a/blas/impl/KokkosBlas2_team_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_team_gemv_impl.hpp @@ -26,51 +26,41 @@ namespace Impl { template struct TeamGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, + const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); // default OpA = OpID - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { - return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, - ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, ys0); } }; template struct TeamVectorGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, + const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); // default OpA = OpID - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, - const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, - const int xs0, const ScalarType beta, - /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { - return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, - ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, + const ValueXType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, ys0); } }; @@ -79,13 +69,12 @@ struct TeamVectorGemvInternal { /// ==================== template <> -template +template KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, - const ScalarType beta, + const MemberType &member, OpA op, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -102,29 +91,26 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( if (beta != one) member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m), - [&](const int &i) { - ValueYType t(0); - const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m), [&](const int &i) { + ValueYType t(0); + const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int j = 0; j < n; ++j) - t += op(tA[j * as1]) * x[j * xs0]; - y[i * ys0] += alpha * t; - }); + for (int j = 0; j < n; ++j) t += op(tA[j * as1]) * x[j * xs0]; + y[i * ys0] += alpha * t; + }); } return 0; } template <> -template +template KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, OpA /* op */, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, - const ScalarType beta, + const MemberType &member, OpA /* op */, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -149,13 +135,10 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( // Made this non-const in order to WORKAROUND issue #349 int mb = mb_a < mb_b ? mb_a : mb_b, mp = m % mb; - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, (m / mb) + (mp > 0)), - [&](const int &ii) { - const int i = ii * mb; - inner.serial_invoke(alpha, A + i * as0, x, - (i + mb) > m ? (m - i) : mb, - n, y + i * ys0); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, (m / mb) + (mp > 0)), [&](const int &ii) { + const int i = ii * mb; + inner.serial_invoke(alpha, A + i * as0, x, (i + mb) > m ? (m - i) : mb, n, y + i * ys0); + }); member.team_barrier(); } @@ -167,14 +150,12 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( /// ==================== template <> -template -KOKKOS_INLINE_FUNCTION int -TeamVectorGemvInternal::invoke( - const MemberType &member, OpA op, const int m, const int n, - const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, - const ScalarType beta, +template +KOKKOS_INLINE_FUNCTION int TeamVectorGemvInternal::invoke( + const MemberType &member, OpA op, const int m, const int n, const ScalarType alpha, + const ValueAType *KOKKOS_RESTRICT A, const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { const ScalarType one(1.0), zero(0.0); @@ -196,12 +177,8 @@ TeamVectorGemvInternal::invoke( const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, n), - [&](const int &j, ValueYType &update) { - update += op(tA[j * as1]) * x[j * xs0]; - }, - t); - Kokkos::single(Kokkos::PerThread(member), - [&]() { y[i * ys0] += alpha * t; }); + [&](const int &j, ValueYType &update) { update += op(tA[j * as1]) * x[j * xs0]; }, t); + Kokkos::single(Kokkos::PerThread(member), [&]() { y[i * ys0] += alpha * t; }); }); } return 0; diff --git a/blas/impl/KokkosBlas2_team_gemv_spec.hpp b/blas/impl/KokkosBlas2_team_gemv_spec.hpp index d46fb7be6f..c3cf43b743 100644 --- a/blas/impl/KokkosBlas2_team_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_team_gemv_spec.hpp @@ -25,28 +25,19 @@ namespace KokkosBlas { -template +template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, - const ScalarType /*alpha*/, - const AViewType& /*A*/, - const xViewType& /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, const ScalarType /*alpha*/, + const AViewType& /*A*/, const xViewType& /*x*/, const ScalarType /*beta*/, const yViewType& /*y*/); }; template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, - const ScalarType /*alpha*/, - const AViewType& /*A*/, - const xViewType& /*x*/, - const ScalarType /*beta*/, + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, const ScalarType /*alpha*/, + const AViewType& /*A*/, const xViewType& /*x*/, const ScalarType /*beta*/, const yViewType& /*y*/); }; @@ -56,31 +47,25 @@ struct TeamVectorGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "KokkosBlas::TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "KokkosBlas::TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1(), x.data(), x.stride_0(), beta, + y.data(), y.stride_0()); } }; @@ -90,31 +75,25 @@ struct TeamGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), + beta, y.data(), y.stride_0()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, + y.data(), y.stride_0()); } }; @@ -124,33 +103,25 @@ struct TeamGemv { template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), - y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, Impl::OpConj{}, A.extent(1), A.extent(0), + alpha, A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; template struct TeamGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "BLAS TeamGemv requires rank-2 A matrix"); - return Impl::TeamGemvInternal::invoke( - member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), - y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke(member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; @@ -160,16 +131,13 @@ struct TeamGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "Batched TeamVectorGemv requires rank-2 A matrix"); - return Impl::TeamVectorGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); + return Impl::TeamVectorGemvInternal::invoke(member, A.extent(0), A.extent(1), alpha, + A.data(), A.stride_0(), A.stride_1(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; @@ -179,16 +147,13 @@ struct TeamVectorGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "Batched TeamVectorGemv requires rank-2 A matrix"); - return Impl::TeamVectorGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); + return Impl::TeamVectorGemvInternal::invoke(member, A.extent(1), A.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; @@ -198,17 +163,13 @@ struct TeamVectorGemv { template struct TeamVectorGemv { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const ScalarType alpha, const AViewType& A, - const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::rank == 2, - "Batched TeamVectorGemv requires rank-2 A matrix"); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( - member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), - A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), - y.stride_0()); + member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), A.stride_0(), x.data(), + x.stride_0(), beta, y.data(), y.stride_0()); } }; diff --git a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 26c4c9624a..15c3c74ecd 100644 --- a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -57,8 +57,7 @@ struct DotBasedGEMM { const size_A dotSize; // the length of the vectors in the dot products - DotBasedGEMM(const scalar_A& alpha_, const AV& A_, const BV& B_, - const scalar_C& beta_, const CV& C_) + DotBasedGEMM(const scalar_A& alpha_, const AV& A_, const BV& B_, const scalar_C& beta_, const CV& C_) : A(A_), B(B_), C(C_), @@ -69,52 +68,39 @@ struct DotBasedGEMM { dotSize(A.extent(0)) {} void run(const ExecSpace& space, bool conjugateTranspose) { - multipleReductionWorkDistribution( - dotSize, numCrows * numCcols, numDivPerDot); + multipleReductionWorkDistribution(dotSize, numCrows * numCcols, numDivPerDot); const size_C ndots = numCrows * numCcols; // Number of dot products numTeams = ndots * numDivPerDot; // Initialize C matrix if beta != 1 if (beta == CVT::zero()) { - Kokkos::MDRangePolicy> policyInit( - space, {0, 0}, {numCrows, numCcols}); - Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", - policyInit, *this); + Kokkos::MDRangePolicy> policyInit(space, {0, 0}, {numCrows, numCcols}); + Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } else if (beta != CVT::one()) { - Kokkos::MDRangePolicy> policyInit( - space, {0, 0}, {numCrows, numCcols}); - Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", - policyInit, *this); + Kokkos::MDRangePolicy> policyInit(space, {0, 0}, {numCrows, numCcols}); + Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } // Multiply alpha*A^TB and add it to beta*C if (conjugateTranspose) { - Kokkos::TeamPolicy policyMult(space, numTeams, - Kokkos::AUTO); + Kokkos::TeamPolicy policyMult(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this); } else { - Kokkos::TeamPolicy policyMult(space, numTeams, - Kokkos::AUTO); + Kokkos::TeamPolicy policyMult(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this); } } KOKKOS_INLINE_FUNCTION - void operator()(const TagZero&, const size_C& rowId, - const size_C& colId) const { - C(rowId, colId) = CVT::zero(); - } + void operator()(const TagZero&, const size_C& rowId, const size_C& colId) const { C(rowId, colId) = CVT::zero(); } KOKKOS_INLINE_FUNCTION - void operator()(const TagInit&, const size_C& rowId, - const size_C& colId) const { + void operator()(const TagInit&, const size_C& rowId, const size_C& colId) const { C(rowId, colId) = beta * C(rowId, colId); } KOKKOS_INLINE_FUNCTION - void operator()(const TagMult&, - const typename Kokkos::TeamPolicy::member_type& - teamMember) const { + void operator()(const TagMult&, const typename Kokkos::TeamPolicy::member_type& teamMember) const { const size_C globalRank = teamMember.league_rank(); const size_C localRank = globalRank % numDivPerDot; const size_C i = globalRank / numDivPerDot; @@ -127,19 +113,13 @@ struct DotBasedGEMM { if (localRank == numDivPerDot - 1) end = dotSize; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(teamMember, begin, end), - [&](const size_A k, scalar_C& update) { - update += alpha * A(k, rowId) * B(k, colId); - }, - result); + [&](const size_A k, scalar_C& update) { update += alpha * A(k, rowId) * B(k, colId); }, result); - Kokkos::single(Kokkos::PerTeam(teamMember), - [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); } KOKKOS_INLINE_FUNCTION - void operator()(const TagMultCT&, - const typename Kokkos::TeamPolicy::member_type& - teamMember) const { + void operator()(const TagMultCT&, const typename Kokkos::TeamPolicy::member_type& teamMember) const { const size_C globalRank = teamMember.league_rank(); const size_C localRank = globalRank % numDivPerDot; const size_C i = globalRank / numDivPerDot; @@ -152,13 +132,9 @@ struct DotBasedGEMM { if (localRank == numDivPerDot - 1) end = dotSize; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(teamMember, begin, end), - [&](const size_A k, scalar_C& update) { - update += alpha * AVT::conj(A(k, rowId)) * B(k, colId); - }, - result); + [&](const size_A k, scalar_C& update) { update += alpha * AVT::conj(A(k, rowId)) * B(k, colId); }, result); - Kokkos::single(Kokkos::PerTeam(teamMember), - [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { Kokkos::atomic_add(&C(rowId, colId), result); }); } }; diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index 1a0ab46bb3..675ef5d3a4 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -55,419 +55,320 @@ struct impl_gemm_choose_copy_layout { #endif // DeepCopy matrix block into scratch -template +template struct impl_deep_copy_matrix_block; -template -struct impl_deep_copy_matrix_block { +template +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - const int idx_i = offset_i + i; - A_scr(i, j) = A(idx_i, idx_j); - }); - }); + const int idx_i = offset_i + i; + A_scr(i, j) = A(idx_i, idx_j); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; + int idx_j = offset_j + j; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; -#endif - const int idx_i = offset_i + i; - A_scr(i, j) = - idx_i < A.extent_int(0) && idx_j < A.extent_int(1) - ? A(idx_i, idx_j) - : ATV::zero(); - }); - }); + int idx_j = offset_j + j; +#endif + const int idx_i = offset_i + i; + A_scr(i, j) = idx_i < A.extent_int(0) && idx_j < A.extent_int(1) ? A(idx_i, idx_j) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { - const int idx_i = offset_i + i; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { - const int idx_j = offset_j + j; - A_scr(i, j) = A(idx_i, idx_j); - }); - }); + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + A_scr(i, j) = A(idx_i, idx_j); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; + int idx_i = offset_i + i; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; -#endif - const int idx_j = offset_j + j; - A_scr(i, j) = - idx_i < A.extent_int(0) && idx_j < A.extent_int(1) - ? A(idx_i, idx_j) - : ATV::zero(); - }); - }); + int idx_i = offset_i + i; +#endif + const int idx_j = offset_j + j; + A_scr(i, j) = idx_i < A.extent_int(0) && idx_j < A.extent_int(1) ? A(idx_i, idx_j) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block { +template +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - const int idx_i = offset_i + i; - A_scr(i, j) = A(idx_j, idx_i); - }); - }); + const int idx_i = offset_i + i; + A_scr(i, j) = A(idx_j, idx_i); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; + int idx_j = offset_j + j; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; -#endif - const int idx_i = offset_i + i; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? A(idx_j, idx_i) - : ATV::zero(); - }); - }); + int idx_j = offset_j + j; +#endif + const int idx_i = offset_i + i; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? A(idx_j, idx_i) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - const int idx_j = offset_j + j; - A_scr(i, j) = A(idx_j, idx_i); - }); - }); + const int idx_j = offset_j + j; + A_scr(i, j) = A(idx_j, idx_i); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; + int idx_i = offset_i + i; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; -#endif - const int idx_j = offset_j + j; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? A(idx_j, idx_i) - : ATV::zero(); - }); - }); + int idx_i = offset_i + i; +#endif + const int idx_j = offset_j + j; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? A(idx_j, idx_i) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block { +template +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_j = offset_j + j; + const int idx_j = offset_j + j; #endif - const int idx_i = offset_i + i; - A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); - }); - }); + const int idx_i = offset_i + i; + A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; + int idx_j = offset_j + j; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_j = offset_j + j; -#endif - const int idx_i = offset_i + i; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? ATV::conj(A(idx_j, idx_i)) - : ATV::zero(); - }); - }); + int idx_j = offset_j + j; +#endif + const int idx_i = offset_i + i; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? ATV::conj(A(idx_j, idx_i)) : ATV::zero(); + }); + }); } } }; -template -struct impl_deep_copy_matrix_block +struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, - const ViewType& A, const int& offset_i, + static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, const ViewType& A, const int& offset_i, const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(1) && - offset_j + blockDim_j <= A.extent_int(0)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + if (offset_i + blockDim_i <= A.extent_int(1) && offset_j + blockDim_j <= A.extent_int(0)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - const int idx_i = offset_i + i; + const int idx_i = offset_i + i; #endif - const int idx_j = offset_j + j; - A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); - }); - }); + const int idx_j = offset_j + j; + A_scr(i, j) = ATV::conj(A(idx_j, idx_i)); + }); + }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { #ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; + int idx_i = offset_i + i; #endif - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { #ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND - int idx_i = offset_i + i; -#endif - const int idx_j = offset_j + j; - A_scr(i, j) = - idx_i < A.extent_int(1) && idx_j < A.extent_int(0) - ? ATV::conj(A(idx_j, idx_i)) - : ATV::zero(); - }); - }); + int idx_i = offset_i + i; +#endif + const int idx_j = offset_j + j; + A_scr(i, j) = idx_i < A.extent_int(1) && idx_j < A.extent_int(0) ? ATV::conj(A(idx_j, idx_i)) : ATV::zero(); + }); + }); } } }; -template +template struct impl_update_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void update(const TeamHandle& team, const value_type& beta, - const ViewType& A, const value_type& alpha, - const ViewTypeScratch& A_scr, const int& offset_i, - const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { - const int idx_j = offset_j + j; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), - [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = beta * A(idx_i, idx_j) + - alpha * A_scr(i, j); - }); - } + static void update(const TeamHandle& team, const value_type& beta, const ViewType& A, const value_type& alpha, + const ViewTypeScratch& A_scr, const int& offset_i, const int& offset_j) { + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, idx_j) = alpha * A_scr(i, j); + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); }); + } + }); } else { - const int range_i = offset_i + blockDim_i <= A.extent_int(0) - ? blockDim_i - : A.extent_int(0) % blockDim_i; - const int range_j = offset_j + blockDim_j <= A.extent_int(1) - ? blockDim_j - : A.extent_int(1) % blockDim_j; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, range_j), [&](const int j) { - const int idx_j = offset_j + j; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_i), - [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, range_i), [&](const int i) { - const int idx_i = offset_i + i; - A(idx_i, idx_j) = - beta * A(idx_i, idx_j) + alpha * A_scr(i, j); - }); - } + const int range_i = offset_i + blockDim_i <= A.extent_int(0) ? blockDim_i : A.extent_int(0) % blockDim_i; + const int range_j = offset_j + blockDim_j <= A.extent_int(1) ? blockDim_j : A.extent_int(1) % blockDim_j; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, range_j), [&](const int j) { + const int idx_j = offset_j + j; + if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, idx_j) = alpha * A_scr(i, j); }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_i), [&](const int i) { + const int idx_i = offset_i + i; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); + }); + } + }); } } }; -template -struct impl_update_matrix_block { +template +struct impl_update_matrix_block { typedef typename ViewType::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION - static void update(const TeamHandle& team, const value_type& beta, - const ViewType& A, const value_type& alpha, - const ViewTypeScratch& A_scr, const int& offset_i, - const int& offset_j) { - if (offset_i + blockDim_i <= A.extent_int(0) && - offset_j + blockDim_j <= A.extent_int(1)) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { - const int idx_i = offset_i + i; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), - [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = beta * A(idx_i, idx_j) + - alpha * A_scr(i, j); - }); - } + static void update(const TeamHandle& team, const value_type& beta, const ViewType& A, const value_type& alpha, + const ViewTypeScratch& A_scr, const int& offset_i, const int& offset_j) { + if (offset_i + blockDim_i <= A.extent_int(0) && offset_j + blockDim_j <= A.extent_int(1)) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockDim_i), [&](const int i) { + const int idx_i = offset_i + i; + if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = alpha * A_scr(i, j); + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockDim_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); }); + } + }); } else { - const int range_i = offset_i + blockDim_i <= A.extent_int(0) - ? blockDim_i - : A.extent_int(0) % blockDim_i; - const int range_j = offset_j + blockDim_j <= A.extent_int(1) - ? blockDim_j - : A.extent_int(1) % blockDim_j; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, range_i), [&](const int i) { - const int idx_i = offset_i + i; - if (beta == ATV::zero()) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_j), - [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = alpha * A_scr(i, j); - }); - } else { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team, range_j), [&](const int j) { - const int idx_j = offset_j + j; - A(idx_i, idx_j) = - beta * A(idx_i, idx_j) + alpha * A_scr(i, j); - }); - } + const int range_i = offset_i + blockDim_i <= A.extent_int(0) ? blockDim_i : A.extent_int(0) % blockDim_i; + const int range_j = offset_j + blockDim_j <= A.extent_int(1) ? blockDim_j : A.extent_int(1) % blockDim_j; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, range_i), [&](const int i) { + const int idx_i = offset_i + i; + if (beta == ATV::zero()) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = alpha * A_scr(i, j); + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, range_j), [&](const int j) { + const int idx_j = offset_j + j; + A(idx_i, idx_j) = beta * A(idx_i, idx_j) + alpha * A_scr(i, j); }); + } + }); } } }; @@ -475,14 +376,11 @@ struct impl_update_matrix_block -KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, - const ViewTypeC& C, - const ViewTypeA& A, +KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const ViewTypeC& C, const ViewTypeA& A, const ViewTypeB& B) { typedef typename ViewTypeC::non_const_value_type ScalarC; // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && \ - (!defined(__CUDA_ARCH__) || !defined(__HIP_DEVICE_COMPILE__)) +#if defined(KOKKOS_COMPILER_GNU) && (!defined(__CUDA_ARCH__) || !defined(__HIP_DEVICE_COMPILE__)) int blockA0 = A.extent_int(0); int blockA1 = A.extent_int(1); int blockB1 = B.extent_int(1); @@ -491,36 +389,34 @@ KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const int blockA1 = A.extent_int(1); const int blockB1 = B.extent_int(1); #endif - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { #ifndef KOKKOSKERNELS_ENABLE_OMP_SIMD - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1 / 4), - [&](const int B_j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1 / 4), [&](const int B_j) { #else #pragma omp simd for(int B_j=0; B_j @@ -565,8 +461,8 @@ struct impl_gemm_label<2, 2> { static constexpr const char* label = "KokkosBlas::gemm[CC]"; }; -template +template struct GEMMImpl { ViewTypeA A; ViewTypeB B; @@ -580,18 +476,14 @@ struct GEMMImpl { int scratch_level; ScalarC alpha, beta; - typedef Kokkos::View + typedef Kokkos::View ViewTypeAScratch; - typedef Kokkos::View + typedef Kokkos::View ViewTypeBScratch; - typedef Kokkos::View + typedef Kokkos::View ViewTypeCScratch; - GEMMImpl(const ScalarA& alpha_, const ViewTypeA& A_, const ViewTypeB& B_, - const ScalarC& beta_, const ViewTypeC& C_) + GEMMImpl(const ScalarA& alpha_, const ViewTypeA& A_, const ViewTypeB& B_, const ScalarC& beta_, const ViewTypeC& C_) : A(A_), B(B_), C(C_), @@ -602,12 +494,10 @@ struct GEMMImpl { beta = beta_; } - void run(const ExecSpace& space, int team_size, int vector_length, - int scr_level) { - scratch_level = scr_level; - int scratch_memory_size = ViewTypeAScratch::shmem_size() + - ViewTypeBScratch::shmem_size() + - ViewTypeCScratch::shmem_size(); + void run(const ExecSpace& space, int team_size, int vector_length, int scr_level) { + scratch_level = scr_level; + int scratch_memory_size = + ViewTypeAScratch::shmem_size() + ViewTypeBScratch::shmem_size() + ViewTypeCScratch::shmem_size(); #if defined(KOKKOS_ENABLE_HIP) // Note lbv, 10/29/20: The LaunchBounds<384, 2> leads @@ -616,23 +506,19 @@ struct GEMMImpl { // are allocated... Switching to LaunchBounds<384, 0> fixes // that problem but I'm not sure if that it a good perf // parameter or why it is set to 2 for Cuda? - Kokkos::TeamPolicy> policy( - space, num_blocks_0 * num_blocks_1, team_size, vector_length); + Kokkos::TeamPolicy> policy(space, num_blocks_0 * num_blocks_1, team_size, + vector_length); #else - Kokkos::TeamPolicy> policy( - space, num_blocks_0 * num_blocks_1, team_size, vector_length); + Kokkos::TeamPolicy> policy(space, num_blocks_0 * num_blocks_1, team_size, + vector_length); #endif - Kokkos::parallel_for( - impl_gemm_label::label, - policy.set_scratch_size(scratch_level, - Kokkos::PerTeam(scratch_memory_size)), - *this); + Kokkos::parallel_for(impl_gemm_label::label, + policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_memory_size)), *this); } KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // This team is responsible for computing a single block of C const int league_rank = team.league_rank(); const int num_blocks = num_blocks_1; @@ -642,11 +528,9 @@ struct GEMMImpl { ViewTypeAScratch A_scr(team.team_scratch(scratch_level)); ViewTypeBScratch B_scr(team.team_scratch(scratch_level)); ViewTypeCScratch C_scr(team.team_scratch(scratch_level)); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1), - [&](const int j) { C_scr(i, j) = 0; }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, blockA0), [&](const int i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, blockB1), [&](const int j) { C_scr(i, j) = 0; }); + }); team.team_barrier(); // Move along the inner dimension in blocks @@ -654,22 +538,16 @@ struct GEMMImpl { for (int A_j = 0; A_j < length; A_j += blockA1) { // Load A block into scratch - impl_deep_copy_matrix_block< - typename Kokkos::TeamPolicy::member_type, ViewTypeAScratch, - ViewTypeA, - typename impl_gemm_choose_copy_layout< - ExecSpace, typename ViewTypeA::array_layout, - typename ViewTypeAScratch::array_layout>::type, - blockA0, blockA1, TransposeA>::copy(team, A_scr, A, i_offset, A_j); + impl_deep_copy_matrix_block::member_type, ViewTypeAScratch, ViewTypeA, + typename impl_gemm_choose_copy_layout::type, + blockA0, blockA1, TransposeA>::copy(team, A_scr, A, i_offset, A_j); // Load B block into scratch - impl_deep_copy_matrix_block< - typename Kokkos::TeamPolicy::member_type, ViewTypeBScratch, - ViewTypeB, - typename impl_gemm_choose_copy_layout< - ExecSpace, typename ViewTypeB::array_layout, - typename ViewTypeBScratch::array_layout>::type, - blockA1, blockB1, TransposeB>::copy(team, B_scr, B, A_j, j_offset); + impl_deep_copy_matrix_block::member_type, ViewTypeBScratch, ViewTypeB, + typename impl_gemm_choose_copy_layout::type, + blockA1, blockB1, TransposeB>::copy(team, B_scr, B, A_j, j_offset); // Wait for A and B block to be in scratch memory team.team_barrier(); @@ -682,10 +560,9 @@ struct GEMMImpl { team.team_barrier(); } // Write back the C block from scratch to main memory - impl_update_matrix_block< - typename Kokkos::TeamPolicy::member_type, ViewTypeC, - ViewTypeCScratch, typename ViewTypeC::array_layout, blockA0, - blockB1>::update(team, beta, C, alpha, C_scr, i_offset, j_offset); + impl_update_matrix_block::member_type, ViewTypeC, ViewTypeCScratch, + typename ViewTypeC::array_layout, blockA0, blockB1>::update(team, beta, C, alpha, C_scr, + i_offset, j_offset); } }; diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index 367a8dad3f..f085b5fc92 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -43,35 +43,27 @@ struct gemm_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct gemm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct gemm_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) // Include the actual specialization declarations #include @@ -85,37 +77,24 @@ namespace Impl { // // Implementation of KokkosBlas::gemm. -template < - class execution_space, class AViewType, class BViewType, class CViewType, - bool tpl_spec_avail = gemm_tpl_spec_avail::value, - bool eti_spec_avail = gemm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = gemm_eti_spec_avail::value> struct GEMM { - static void gemm(const execution_space& space, const char transA[], - const char transB[], - typename AViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B, - typename CViewType::const_value_type& beta, - const CViewType& C) + static void gemm(const execution_space& space, const char transA[], const char transB[], + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, + typename CViewType::const_value_type& beta, const CViewType& C) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); - static_assert(static_cast(CViewType::rank) == 2, - "CViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "CViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); + static_assert(static_cast(CViewType::rank) == 2, "CViewType must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::gemm[ETI]" - : "KokkosBlas::gemm[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::gemm[ETI]" + : "KokkosBlas::gemm[noETI]"); // Figure out Scalar Types typedef typename AViewType::non_const_value_type ScalarA; typedef typename BViewType::non_const_value_type ScalarB; @@ -125,29 +104,22 @@ struct GEMM { const int M = static_cast(C.extent(0)); const int N = static_cast(C.extent(1)); - const bool is_device_space = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - const bool A_is_lr = std::is_same::value; - const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || - (transA[0] == 'C') || (transA[0] == 'c')); - const bool B_is_tr = ((transB[0] == 'T') || (transB[0] == 't') || - (transB[0] == 'C') || (transB[0] == 'c')); + const bool is_device_space = KokkosKernels::Impl::kk_is_gpu_exec_space(); + const bool A_is_lr = std::is_same::value; + const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || (transA[0] == 'C') || (transA[0] == 'c')); + const bool B_is_tr = ((transB[0] == 'T') || (transB[0] == 't') || (transB[0] == 'C') || (transB[0] == 'c')); // NOTE: these thresholds were copied from TPL CUBLAS, and may need to be // retuned constexpr int numDotsLayoutLeftThreshold = 1600; constexpr int numDotsLayoutRightThreshold = 100; - if (((!A_is_lr && A_is_tr && !B_is_tr && - M * N < numDotsLayoutLeftThreshold) || - (A_is_lr && A_is_tr && !B_is_tr && - M * N < numDotsLayoutRightThreshold)) && + if (((!A_is_lr && A_is_tr && !B_is_tr && M * N < numDotsLayoutLeftThreshold) || + (A_is_lr && A_is_tr && !B_is_tr && M * N < numDotsLayoutRightThreshold)) && is_device_space) { // call dot-based GEMM, only for C := beta * C + alpha * A^T * B, on // device bool A_is_conj = ((transA[0] == 'C') || (transA[0] == 'c')); - DotBasedGEMM - dotBasedGemm(alpha, A, B, beta, C); + DotBasedGEMM dotBasedGemm(alpha, A, B, beta, C); dotBasedGemm.run(space, A_is_conj); } else { @@ -155,116 +127,87 @@ struct GEMM { static constexpr int blockA0 = 24; static constexpr int blockB1 = 64; static constexpr int blockA1 = - (sizeof(ScalarA) * blockA0 * 16 + sizeof(ScalarB) * 16 * blockB1 + - sizeof(ScalarC) * blockA0 * blockB1 < + (sizeof(ScalarA) * blockA0 * 16 + sizeof(ScalarB) * 16 * blockB1 + sizeof(ScalarC) * blockA0 * blockB1 < 24000) ? 16 - : (sizeof(ScalarA) * blockA0 * 8 + sizeof(ScalarB) * 8 * blockB1 + - sizeof(ScalarC) * blockA0 * blockB1 < - 24000) - ? 8 - : (sizeof(ScalarA) * blockA0 * 4 + - sizeof(ScalarB) * 4 * blockB1 + - sizeof(ScalarC) * blockA0 * blockB1 < - 24000) - ? 4 - : 16; - int vector_length = blockB1 / 4; - int max_vector_length = - KokkosKernels::Impl::kk_get_max_vector_size(); + : (sizeof(ScalarA) * blockA0 * 8 + sizeof(ScalarB) * 8 * blockB1 + sizeof(ScalarC) * blockA0 * blockB1 < + 24000) + ? 8 + : (sizeof(ScalarA) * blockA0 * 4 + sizeof(ScalarB) * 4 * blockB1 + sizeof(ScalarC) * blockA0 * blockB1 < + 24000) + ? 4 + : 16; + int vector_length = blockB1 / 4; + int max_vector_length = KokkosKernels::Impl::kk_get_max_vector_size(); if (vector_length > max_vector_length) vector_length = max_vector_length; // Compute scratch space size - typedef KokkosBlas::Impl::GEMMImpl + typedef KokkosBlas::Impl::GEMMImpl gemm_dummy_type; - const int scratch_memory_size = - gemm_dummy_type::ViewTypeAScratch::required_allocation_size() + - gemm_dummy_type::ViewTypeBScratch::required_allocation_size() + - gemm_dummy_type::ViewTypeCScratch::required_allocation_size(); + const int scratch_memory_size = gemm_dummy_type::ViewTypeAScratch::required_allocation_size() + + gemm_dummy_type::ViewTypeBScratch::required_allocation_size() + + gemm_dummy_type::ViewTypeCScratch::required_allocation_size(); const int scratch_level = scratch_memory_size < 24000 ? 0 : 1; // Figure out Team Sizes int team_size = 1; #if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_SYCL) - if (std::is_same::value) - team_size = blockA0; + if (std::is_same::value) team_size = blockA0; #endif // Call the correct kernel - if ((transA[0] == 'N' || transA[0] == 'n') && - (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'N' || transB[0] == 'n')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'T' || transA[0] == 't') && - (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'N' || transB[0] == 'n')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'C' || transA[0] == 'c') && - (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'N' || transB[0] == 'n')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'N' || transA[0] == 'n') && - (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'T' || transB[0] == 't')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'T' || transA[0] == 't') && - (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'T' || transB[0] == 't')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'C' || transA[0] == 'c') && - (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'T' || transB[0] == 't')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'N' || transA[0] == 'n') && - (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'C' || transB[0] == 'c')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'T' || transA[0] == 't') && - (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'C' || transB[0] == 'c')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } - if ((transA[0] == 'C' || transA[0] == 'c') && - (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'C' || transB[0] == 'c')) { + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } @@ -288,61 +231,45 @@ struct GEMM { // one or more .cpp files. // -#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, EXEC_SPACE, MEM_SPACE) \ - extern template struct GEMM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, EXEC_SPACE, MEM_SPACE) \ + extern template struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; -#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, EXEC_SPACE, MEM_SPACE) \ - template struct GEMM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, EXEC_SPACE, MEM_SPACE) \ + template struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; -#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) -#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutLeft, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, \ - Kokkos::LayoutRight, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) +#define KOKKOSBLAS3_GEMM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutLeft, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutLeft, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, Kokkos::LayoutRight, Kokkos::LayoutRight, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) #include diff --git a/blas/impl/KokkosBlas3_trmm_impl.hpp b/blas/impl/KokkosBlas3_trmm_impl.hpp index a183675889..8a1e9a7a4a 100644 --- a/blas/impl/KokkosBlas3_trmm_impl.hpp +++ b/blas/impl/KokkosBlas3_trmm_impl.hpp @@ -32,10 +32,8 @@ namespace KokkosBlas { namespace Impl { template -void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], - const char /*diag*/[], - typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { +void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], const char /*diag*/[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { using KokkosBatched::Algo; using KokkosBatched::Diag; using KokkosBatched::SerialTrmmInternalLeftLower; @@ -43,8 +41,7 @@ void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], using KokkosBatched::SerialTrmmInternalRightLower; using KokkosBatched::SerialTrmmInternalRightUpper; - char __side = tolower(side[0]), __uplo = tolower(uplo[0]), - __trans = tolower(trans[0]); + char __side = tolower(side[0]), __uplo = tolower(uplo[0]), __trans = tolower(trans[0]); //__diag = tolower(diag[0]); bool do_conj = true; @@ -53,79 +50,67 @@ void SerialTrmm_Invoke(const char side[], const char uplo[], const char trans[], //// Lower non-transpose //// if (__side == 'l' && __uplo == 'l' && __trans == 'n') SerialTrmmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'l' && __trans == 'n') SerialTrmmInternalRightLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); //// Lower transpose ///// // Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'l' && __trans == 't') SerialTrmmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'l' && __trans == 't') SerialTrmmInternalRightUpper::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// Lower conjugate-transpose //// // Conjugate-Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'l' && __trans == 'c') SerialTrmmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'l' && __trans == 'c') SerialTrmmInternalRightUpper::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// Upper non-transpose //// if (__side == 'l' && __uplo == 'u' && __trans == 'n') SerialTrmmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'u' && __trans == 'n') SerialTrmmInternalRightUpper::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(0), A.extent(1), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); //// Upper transpose // Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'u' && __trans == 't') SerialTrmmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'u' && __trans == 't') SerialTrmmInternalRightLower::invoke( - Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, !do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// Upper conjugate-transpose //// // Conjugate-Transpose A by simply swapping the dimensions (extent) and stride // parameters if (__side == 'l' && __uplo == 'u' && __trans == 'c') SerialTrmmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); if (__side == 'r' && __uplo == 'u' && __trans == 'c') SerialTrmmInternalRightLower::invoke( - Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), - B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), - B.data(), B.stride(0), B.stride(1)); + Diag::Unit::use_unit_diag, do_conj, A.extent(1), A.extent(0), B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); } } // namespace Impl } // namespace KokkosBlas diff --git a/blas/impl/KokkosBlas3_trmm_spec.hpp b/blas/impl/KokkosBlas3_trmm_spec.hpp index 85a8b1c6db..6399f9e57e 100644 --- a/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -36,25 +36,21 @@ struct trmm_eti_spec_avail { // // This Macro is for readability of the template arguments. // -#define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct trmm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct trmm_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // // This Macros provides the ETI specialization of trmm // #define KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRMM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) // Include the actual specialization declarations #include @@ -69,33 +65,25 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - trmm_eti_spec_avail::value> + bool tpl_spec_avail = trmm_tpl_spec_avail::value, + bool eti_spec_avail = trmm_eti_spec_avail::value> struct TRMM { - static void trmm(const execution_space& space, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BVIT::const_value_type& alpha, const AVIT& A, - const BVIT& B); + static void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], + const char diag[], typename BVIT::const_value_type& alpha, const AVIT& A, const BVIT& B); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct TRMM { - static void trmm(const execution_space& /*space*/, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BVIT::const_value_type& alpha, const AVIT& A, - const BVIT& B) { +struct TRMM { + static void trmm(const execution_space& /*space*/, const char side[], const char uplo[], const char trans[], + const char diag[], typename BVIT::const_value_type& alpha, const AVIT& A, const BVIT& B) { static_assert(Kokkos::is_view::value, "AVIT must be a Kokkos::View."); static_assert(Kokkos::is_view::value, "BVIT must be a Kokkos::View."); static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); static_assert(static_cast(BVIT::rank) == 2, "BVIT must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::trmm[ETI]" - : "KokkosBlas::trmm[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::trmm[ETI]" + : "KokkosBlas::trmm[noETI]"); typename AVIT::HostMirror host_A = Kokkos::create_mirror_view(A); typename BVIT::HostMirror host_B = Kokkos::create_mirror_view(B); @@ -105,8 +93,8 @@ struct TRMM( - side, uplo, trans, diag, alpha, host_A, host_B); + SerialTrmm_Invoke(side, uplo, trans, diag, alpha, host_A, + host_B); // Copy host_B to B // no-op if B's MemorySpace is HostSpace @@ -124,27 +112,21 @@ struct TRMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; - -#define KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template struct TRMM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + extern template struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; // // These Macros are only included when we are not compiling libkokkoskernels but @@ -154,12 +136,10 @@ struct TRMM diff --git a/blas/impl/KokkosBlas3_trsm_impl.hpp b/blas/impl/KokkosBlas3_trsm_impl.hpp index 87cac8b86a..57c1342eb5 100644 --- a/blas/impl/KokkosBlas3_trsm_impl.hpp +++ b/blas/impl/KokkosBlas3_trsm_impl.hpp @@ -34,12 +34,9 @@ namespace KokkosBlas { namespace Impl { template -int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType* KOKKOS_RESTRICT B, - const int bs0, const int bs1) { +int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); @@ -47,8 +44,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, if (alpha == zero) SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -56,8 +52,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const ValueType* KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + p * as1; - ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, - *KOKKOS_RESTRICT B2 = B + (p + 1) * bs0; + ValueType *KOKKOS_RESTRICT b1t = B + p * bs0, *KOKKOS_RESTRICT B2 = B + (p + 1) * bs0; if (!use_unit_diag) { const ValueType alpha11 = AT::conj(A[p * as0 + p * as1]); @@ -65,20 +60,16 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, } for (int i = 0; i < iend; ++i) - for (int j = 0; j < jend; ++j) - B2[i * bs0 + j * bs1] -= AT::conj(a21[i * as0]) * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B2[i * bs0 + j * bs1] -= AT::conj(a21[i * as0]) * b1t[j * bs1]; } } return 0; } template -int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, - const int n, const ScalarType alpha, - const ValueType* KOKKOS_RESTRICT A, - const int as0, const int as1, - /**/ ValueType* KOKKOS_RESTRICT B, - const int bs0, const int bs1) { +int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, const int n, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); @@ -86,8 +77,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, if (alpha == zero) SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) - KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType* KOKKOS_RESTRICT B0 = B; @@ -105,8 +95,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, if (p > 0) { // Note: A workaround to produce correct results for // complex with Intel-18.2.199 for (int i = 0; i < iend; ++i) - for (int j = 0; j < jend; ++j) - B0[i * bs0 + j * bs1] -= AT::conj(a01[i * as0]) * b1t[j * bs1]; + for (int j = 0; j < jend; ++j) B0[i * bs0 + j * bs1] -= AT::conj(a01[i * as0]) * b1t[j * bs1]; } } } @@ -114,204 +103,146 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, } template -void SerialTrsm_Invoke(const char side[], const char uplo[], const char trans[], - const char diag[], - typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { +void SerialTrsm_Invoke(const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { using KokkosBatched::Algo; using KokkosBatched::Diag; // Side::Left, Uplo::Lower, Trans::NoTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Lower, Trans::Transpose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Lower, Trans::ConjTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - SerialTrsmInternalLeftUpperConj( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - SerialTrsmInternalLeftUpperConj( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftUpperConj(Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), + A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftUpperConj(Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); // Side::Left, Uplo::Upper, Trans::NoTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Upper, Trans::Transpose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(0), B.stride(1)); // Side::Left, Uplo::Upper, Trans::ConjTranspose - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - SerialTrsmInternalLeftLowerConj( - Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); - if (((side[0] == 'L') || (side[0] == 'l')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - SerialTrsmInternalLeftLowerConj( - Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftLowerConj(Diag::Unit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), A.stride(1), + A.stride(0), B.data(), B.stride(0), B.stride(1)); + if (((side[0] == 'L') || (side[0] == 'l')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftLowerConj(Diag::NonUnit::use_unit_diag, B.extent(0), B.extent(1), alpha, A.data(), + A.stride(1), A.stride(0), B.data(), B.stride(0), B.stride(1)); //// // Side::Right, Uplo::Lower, Trans::NoTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Lower, Trans::Transpose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Lower, Trans::ConjTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - SerialTrsmInternalLeftLowerConj( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'L') || (uplo[0] == 'l')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - SerialTrsmInternalLeftLowerConj( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftLowerConj(Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), + A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'L') || (uplo[0] == 'l')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftLowerConj(Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); // Side::Right, Uplo::Upper, Trans::NoTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'N') || (trans[0] == 'n')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'N') || (trans[0] == 'n')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftLower::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(1), A.stride(0), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(1), A.stride(0), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Upper, Trans::Transpose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'U') || (diag[0] == 'u'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'T') || (trans[0] == 't')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) + Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'T') || (trans[0] == 't')) && ((diag[0] == 'N') || (diag[0] == 'n'))) KokkosBatched::SerialTrsmInternalLeftUpper::invoke( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), A.stride(1), B.data(), + B.stride(1), B.stride(0)); // Side::Right, Uplo::Upper, Trans::ConjTranspose - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'U') || (diag[0] == 'u'))) - SerialTrsmInternalLeftUpperConj( - Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); - if (((side[0] == 'R') || (side[0] == 'r')) && - ((uplo[0] == 'U') || (uplo[0] == 'u')) && - ((trans[0] == 'C') || (trans[0] == 'c')) && - ((diag[0] == 'N') || (diag[0] == 'n'))) - SerialTrsmInternalLeftUpperConj( - Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), - A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'U') || (diag[0] == 'u'))) + SerialTrsmInternalLeftUpperConj(Diag::Unit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride(0), + A.stride(1), B.data(), B.stride(1), B.stride(0)); + if (((side[0] == 'R') || (side[0] == 'r')) && ((uplo[0] == 'U') || (uplo[0] == 'u')) && + ((trans[0] == 'C') || (trans[0] == 'c')) && ((diag[0] == 'N') || (diag[0] == 'n'))) + SerialTrsmInternalLeftUpperConj(Diag::NonUnit::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride(0), A.stride(1), B.data(), B.stride(1), B.stride(0)); } } // namespace Impl diff --git a/blas/impl/KokkosBlas3_trsm_spec.hpp b/blas/impl/KokkosBlas3_trsm_spec.hpp index 93d01ed53b..8c9088e970 100644 --- a/blas/impl/KokkosBlas3_trsm_spec.hpp +++ b/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -42,22 +42,18 @@ struct trsm_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct trsm_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct trsm_eti_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRSM_ETI_SPEC_AVAIL_LAYOUT(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) // Include the actual specialization declarations #include @@ -72,38 +68,28 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = - trsm_eti_spec_avail::value> + bool tpl_spec_avail = trsm_tpl_spec_avail::value, + bool eti_spec_avail = trsm_eti_spec_avail::value> struct TRSM { - static void trsm(const execution_space& space, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B); + static void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B); }; // Implementation of KokkosBlas::trsm. #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct TRSM { - static void trsm(const execution_space& /*space*/, const char side[], - const char uplo[], const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); - - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosBlas::trsm[ETI]" - : "KokkosBlas::trsm[noETI]"); +struct TRSM { + static void trsm(const execution_space& /*space*/, const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); + + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::trsm[ETI]" + : "KokkosBlas::trsm[noETI]"); typename AViewType::HostMirror h_A = Kokkos::create_mirror_view(A); typename BViewType::HostMirror h_B = Kokkos::create_mirror_view(B); @@ -111,9 +97,8 @@ struct TRSM(side, uplo, trans, diag, - alpha, h_A, h_B); + SerialTrsm_Invoke(side, uplo, trans, diag, alpha, + h_A, h_B); Kokkos::deep_copy(B, h_B); @@ -134,35 +119,27 @@ struct TRSM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; - -#define KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ - EXEC_SPACE, MEM_SPACE) \ - template struct TRSM< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - false, true>; +#define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + extern template struct TRSM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, EXEC_SPACE, MEM_SPACE) \ + template struct TRSM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; #define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) #define KOKKOSBLAS3_TRSM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) + KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) #include diff --git a/blas/impl/KokkosBlas_serial_axpy.hpp b/blas/impl/KokkosBlas_serial_axpy.hpp index 344632b8eb..83bb2b9c98 100644 --- a/blas/impl/KokkosBlas_serial_axpy.hpp +++ b/blas/impl/KokkosBlas_serial_axpy.hpp @@ -26,9 +26,8 @@ namespace Impl { /// Serial Internal Impl /// ==================== template -KOKKOS_INLINE_FUNCTION static void serial_axpy( - const int m, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT X, - /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int ys0) { +KOKKOS_INLINE_FUNCTION static void serial_axpy(const int m, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int ys0) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -38,17 +37,14 @@ KOKKOS_INLINE_FUNCTION static void serial_axpy( } template -KOKKOS_INLINE_FUNCTION static void serial_axpy_mv( - const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT X, - /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int xs1, - const int ys0, const int ys1) { +KOKKOS_INLINE_FUNCTION static void serial_axpy_mv(const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int xs1, + const int ys0, const int ys1) { if (xs0 > xs1) { - for (int i = 0; i < m; ++i) - serial_axpy(n, alpha, X + i * xs0, Y + i * ys0, xs1, ys1); + for (int i = 0; i < m; ++i) serial_axpy(n, alpha, X + i * xs0, Y + i * ys0, xs1, ys1); } else { - for (int j = 0; j < n; ++j) - serial_axpy(m, alpha, X + j * xs1, Y + j * ys1, xs0, ys0); + for (int j = 0; j < n; ++j) serial_axpy(m, alpha, X + j * xs1, Y + j * ys1, xs0, ys0); } return; diff --git a/blas/impl/KokkosBlas_serial_nrm2.hpp b/blas/impl/KokkosBlas_serial_nrm2.hpp index 1b40ea32a8..db17736c0f 100644 --- a/blas/impl/KokkosBlas_serial_nrm2.hpp +++ b/blas/impl/KokkosBlas_serial_nrm2.hpp @@ -27,10 +27,8 @@ namespace Impl { /// Serial Internal Impl /// ==================== template -KOKKOS_INLINE_FUNCTION static - typename Kokkos::Details::InnerProductSpaceTraits::mag_type - serial_nrm2(const int m, const ValueType *KOKKOS_RESTRICT X, - const int xs0) { +KOKKOS_INLINE_FUNCTION static typename Kokkos::Details::InnerProductSpaceTraits::mag_type serial_nrm2( + const int m, const ValueType *KOKKOS_RESTRICT X, const int xs0) { using IPT = Kokkos::Details::InnerProductSpaceTraits; using norm_type = typename IPT::mag_type; @@ -39,21 +37,16 @@ KOKKOS_INLINE_FUNCTION static #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int i = 0; i < m; ++i) - nrm += IPT::norm(IPT::dot(X[i * xs0], X[i * xs0])); + for (int i = 0; i < m; ++i) nrm += IPT::norm(IPT::dot(X[i * xs0], X[i * xs0])); return Kokkos::ArithTraits::sqrt(nrm); } template KOKKOS_INLINE_FUNCTION static void serial_nrm2( - const int m, const int n, const ValueType *KOKKOS_RESTRICT X, const int xs0, - const int xs1, - typename Kokkos::Details::InnerProductSpaceTraits::mag_type - *KOKKOS_RESTRICT R, - const int ys0) { - for (int vecIdx = 0; vecIdx < n; ++vecIdx) - R[vecIdx * ys0] = serial_nrm2(m, X + vecIdx * xs1, xs0); + const int m, const int n, const ValueType *KOKKOS_RESTRICT X, const int xs0, const int xs1, + typename Kokkos::Details::InnerProductSpaceTraits::mag_type *KOKKOS_RESTRICT R, const int ys0) { + for (int vecIdx = 0; vecIdx < n; ++vecIdx) R[vecIdx * ys0] = serial_nrm2(m, X + vecIdx * xs1, xs0); return; } diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 1fc6b7d480..885625673f 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -135,12 +135,9 @@ namespace Impl { // Output params: // * teamsPerReduction: number of teams to use for each reduction template -void multipleReductionWorkDistribution(size_type length, - size_type numReductions, - size_type &teamsPerDot) { - constexpr size_type workPerTeam = 4096; // Amount of work per team - size_type appxNumTeams = - (length * numReductions) / workPerTeam; // Estimation for appxNumTeams +void multipleReductionWorkDistribution(size_type length, size_type numReductions, size_type &teamsPerDot) { + constexpr size_type workPerTeam = 4096; // Amount of work per team + size_type appxNumTeams = (length * numReductions) / workPerTeam; // Estimation for appxNumTeams // Adjust appxNumTeams in case it is too small or too large if (appxNumTeams < 1) appxNumTeams = 1; diff --git a/blas/src/KokkosBlas1_abs.hpp b/blas/src/KokkosBlas1_abs.hpp index bd63ccedf1..f3ea88bb03 100644 --- a/blas/src/KokkosBlas1_abs.hpp +++ b/blas/src/KokkosBlas1_abs.hpp @@ -46,19 +46,14 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "R is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::abs: RMV must be accessible from execution space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: RMV must be accessible from execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::abs: XMV must be accessible from execution space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: XMV must be accessible from execution space"); + static_assert(std::is_same::value, "KokkosBlas::abs: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -73,30 +68,25 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { if (X.extent(0) != R.extent(0) || X.extent(1) != R.extent(1)) { std::ostringstream os; os << "KokkosBlas::abs (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1); + << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. - using RMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RMV::device_type, Kokkos::MemoryTraits >; - using XMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits >; + using RMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename RMV::device_type, Kokkos::MemoryTraits >; + using XMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XMV::device_type, Kokkos::MemoryTraits >; RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Abs::abs(space, R_internal, - X_internal); + Impl::Abs::abs(space, R_internal, X_internal); } /// \brief R(i,j) = abs(X(i,j)) diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 5cd03dd7c7..788995679c 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -60,10 +60,8 @@ namespace KokkosBlas { /// \param Y [in/out] View of type YMV in which the results will be /// stored. template -void axpby(const execution_space& exec_space, const AV& a, const XMV& X, - const BV& b, const YMV& Y) { - using AxpbyTraits = - Impl::AxpbyUnificationAttemptTraits; +void axpby(const execution_space& exec_space, const AV& a, const XMV& X, const BV& b, const YMV& Y) { + using AxpbyTraits = Impl::AxpbyUnificationAttemptTraits; using InternalTypeA = typename AxpbyTraits::InternalTypeA; using InternalTypeX = typename AxpbyTraits::InternalTypeX; using InternalTypeB = typename AxpbyTraits::InternalTypeB; @@ -95,37 +93,28 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // and 'b' become scalars as well, eventually changing precision in // order to match the precisions of 'X' and 'Y'. // ******************************************************************** - if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && - AxpbyTraits::onDevice) { + if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && AxpbyTraits::onDevice) { // ****************************************************************** // We are in the exception situation for rule 2 // ****************************************************************** InternalTypeA internal_a(a); InternalTypeA internal_b(b); - Impl::Axpby::axpby(exec_space, internal_a, internal_X, - internal_b, internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // ****************************************************************** // We are in rule 1, that is, we are in a 'onHost' case now // ****************************************************************** - InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost< - AV, Impl::typeRank()>::getValue(a)); - InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost< - BV, Impl::typeRank()>::getValue(b)); + InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost()>::getValue(a)); + InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost()>::getValue(b)); - Impl::Axpby::axpby(exec_space, internal_a, internal_X, - internal_b, internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } } else if constexpr (AxpbyTraits::internalTypesAB_bothViews) { - constexpr bool internalLayoutA_isStride( - std::is_same_v); - constexpr bool internalLayoutB_isStride( - std::is_same_v); + constexpr bool internalLayoutA_isStride(std::is_same_v); + constexpr bool internalLayoutB_isStride(std::is_same_v); const size_t numScalarsA(Impl::getAmountOfScalarsInCoefficient(a)); const size_t numScalarsB(Impl::getAmountOfScalarsInCoefficient(b)); @@ -143,8 +132,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // ****************************************************************** // Prepare internal_a // ****************************************************************** - typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", - layoutStrideA); + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", layoutStrideA); if constexpr (AxpbyTraits::atInputLayoutA_isStride) { Kokkos::deep_copy(managed_a, a); } else { @@ -156,8 +144,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - layoutStrideB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", layoutStrideB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -168,16 +155,13 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) // **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - numScalarsB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", numScalarsB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -188,17 +172,14 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) // **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } } else { // ****************************************************************** // Prepare internal_a // ****************************************************************** - typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", - numScalarsA); + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", numScalarsA); if constexpr (AxpbyTraits::atInputLayoutA_isStride) { Kokkos::deep_copy(managed_a, a); } else { @@ -210,8 +191,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - layoutStrideB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", layoutStrideB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -222,16 +202,13 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) // **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } else { // **************************************************************** // Prepare internal_b // **************************************************************** - typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", - numScalarsB); + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", numScalarsB); if constexpr (AxpbyTraits::atInputLayoutB_isStride) { Kokkos::deep_copy(managed_b, b); } else { @@ -242,10 +219,8 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // **************************************************************** // Call Impl::Axpby<...>::axpby(...) // **************************************************************** - Impl::Axpby::axpby(exec_space, internal_a, - internal_X, internal_b, - internal_Y); + Impl::Axpby::axpby( + exec_space, internal_a, internal_X, internal_b, internal_Y); } } } @@ -299,10 +274,8 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { /// \param Y [in/out] View of type YMV in which the results will be /// stored. template -void axpy(const execution_space& exec_space, const AV& a, const XMV& X, - const YMV& Y) { - axpby(exec_space, a, X, - Kokkos::ArithTraits::one(), Y); +void axpy(const execution_space& exec_space, const AV& a, const XMV& X, const YMV& Y) { + axpby(exec_space, a, X, Kokkos::ArithTraits::one(), Y); } /// \brief Computes Y := a*X + Y @@ -334,23 +307,17 @@ void axpy(const AV& a, const XMV& X, const YMV& Y) { template KOKKOS_FUNCTION void serial_axpy(const scalar_type alpha, const XMV X, YMV Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); - static_assert(XMV::rank == 1 || XMV::rank == 2, - "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); - static_assert( - XMV::rank == YMV::rank, - "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); + static_assert(XMV::rank == 1 || XMV::rank == 2, "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); + static_assert(XMV::rank == YMV::rank, "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { Kokkos::abort("KokkosBlas::serial_axpy: X and Y dimensions do not match"); } #endif // KOKKOSKERNELS_DEBUG_LEVEL - return Impl::serial_axpy_mv(X.extent(0), X.extent(1), alpha, X.data(), - Y.data(), X.stride_0(), X.stride_1(), + return Impl::serial_axpy_mv(X.extent(0), X.extent(1), alpha, X.data(), Y.data(), X.stride_0(), X.stride_1(), Y.stride_0(), Y.stride_1()); } diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index aa995836eb..6e1a428b51 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -37,28 +37,19 @@ namespace KokkosBlas { /// /// \return The dot product result; a single value. template , - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type -dot(const execution_space& space, const XVector& x, const YVector& y) { + typename std::enable_if, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::dot_type dot( + const execution_space& space, const XVector& x, const YVector& y) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::dot: execution_space must be a valid Kokkos " "execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::dot: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: XVector must be accessible from execution_space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::dot: YVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: YVector must be accessible from execution_space"); - static_assert((int)XVector::rank == (int)YVector::rank, - "KokkosBlas::dot: Vector ranks do not match."); + static_assert(Kokkos::is_view::value, "KokkosBlas::dot: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::dot: YVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: YVector must be accessible from execution_space"); + static_assert((int)XVector::rank == (int)YVector::rank, "KokkosBlas::dot: Vector ranks do not match."); static_assert(XVector::rank == 1, "KokkosBlas::dot: " "Both Vector inputs must have rank 1."); @@ -72,29 +63,23 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits>; - using YVector_Internal = Kokkos::View< - typename YVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits>; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits>; + using YVector_Internal = Kokkos::View::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; - using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type; + using dot_type = typename Kokkos::Details::InnerProductSpaceTraits::dot_type; // result_type is usually just dot_type, except: // if dot_type is float, result_type is double // if dot_type is complex, result_type is complex // These special cases are to maintain accuracy. - using result_type = - typename KokkosBlas::Impl::DotAccumulatingScalar::type; + using result_type = typename KokkosBlas::Impl::DotAccumulatingScalar::type; using RVector_Internal = - Kokkos::View>; + Kokkos::View>; using RVector_Result = - Kokkos::View>; + Kokkos::View>; XVector_Internal X = x; YVector_Internal Y = y; @@ -108,24 +93,19 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { // two different scalar types. result_type result{}; RVector_Result R = RVector_Result(&result); - Impl::DotSpecialAccumulator::dot(space, - R, X, - Y); + Impl::DotSpecialAccumulator::dot(space, R, X, + Y); space.fence(); // mfh 22 Jan 2020: We need the line below because // Kokkos::complex lacks a constructor that takes a // Kokkos::complex with U != T. - return Kokkos::Details::CastPossiblyComplex::cast( - result); + return Kokkos::Details::CastPossiblyComplex::cast(result); } else { dot_type result{}; RVector_Internal R = RVector_Internal(&result); - Impl::Dot::dot(space, R, X, Y); + Impl::Dot::dot(space, R, X, Y); space.fence(); - return Kokkos::Details::CastPossiblyComplex::cast( - result); + return Kokkos::Details::CastPossiblyComplex::cast(result); } } @@ -142,9 +122,8 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { /// /// \return The dot product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type -dot(const XVector& x, const YVector& y) { +typename Kokkos::Details::InnerProductSpaceTraits::dot_type dot( + const XVector& x, const YVector& y) { return dot(typename XVector::execution_space{}, x, y); } @@ -192,35 +171,26 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: XMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "Y is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::dot: XMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::dot: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(RV::rank == 0 || RV::rank == 1, - "KokkosBlas::dot: R must have rank 0 or 1."); - static_assert(XMV::rank == 1 || XMV::rank == 2, - "KokkosBlas::dot: X must have rank 1 or 2."); - static_assert(YMV::rank == 1 || YMV::rank == 2, - "KokkosBlas::dot: Y must have rank 1 or 2."); - static_assert((XMV::rank == 2 && YMV::rank == 2 && RV::rank == 1) || - (XMV::rank == 1 && YMV::rank == 1 && RV::rank == 0) || - (XMV::rank == 2 && YMV::rank == 1 && RV::rank == 1) || - (XMV::rank == 1 && YMV::rank == 2 && RV::rank == 1), - "KokkosBlas::dot: Ranks of RV, XMV, and YMV don't match. " - "See this function's documentation for the allowed " - "combinations of ranks."); + static_assert(RV::rank == 0 || RV::rank == 1, "KokkosBlas::dot: R must have rank 0 or 1."); + static_assert(XMV::rank == 1 || XMV::rank == 2, "KokkosBlas::dot: X must have rank 1 or 2."); + static_assert(YMV::rank == 1 || YMV::rank == 2, "KokkosBlas::dot: Y must have rank 1 or 2."); + static_assert( + (XMV::rank == 2 && YMV::rank == 2 && RV::rank == 1) || (XMV::rank == 1 && YMV::rank == 1 && RV::rank == 0) || + (XMV::rank == 2 && YMV::rank == 1 && RV::rank == 1) || (XMV::rank == 1 && YMV::rank == 2 && RV::rank == 1), + "KokkosBlas::dot: Ranks of RV, XMV, and YMV don't match. " + "See this function's documentation for the allowed " + "combinations of ranks."); // Check compatibility of dimensions at run time. @@ -228,8 +198,7 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, bool dimsMatch = true; if (X.extent(0) != Y.extent(0)) { dimsMatch = false; - } else if (X.extent(1) != Y.extent(1) && X.extent(1) != 1 && - Y.extent(1) != 1) { + } else if (X.extent(1) != Y.extent(1) && X.extent(1) != 1 && Y.extent(1) != 1) { // Numbers of columns don't match, and neither X nor Y have one column. dimsMatch = false; } @@ -244,43 +213,33 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, if (RV::rank == 1) { os << "R: " << R.extent(0) << " x " << X.extent(1) << ", "; } - os << "X: " << X.extent(0) << " x " << X.extent(1) << ", Y: " << Y.extent(0) - << " x " << Y.extent(1); + os << "X: " << X.extent(0) << " x " << X.extent(1) << ", Y: " << Y.extent(0) << " x " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; - typedef Kokkos::View::type, - UnifiedRVLayout, typename RV::device_type, - Kokkos::MemoryTraits> + typedef Kokkos::View::type, + UnifiedRVLayout, typename RV::device_type, Kokkos::MemoryTraits> RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, - Kokkos::MemoryTraits> + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits> XMV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YMV::device_type, Kokkos::MemoryTraits> + typedef Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename YMV::device_type, + Kokkos::MemoryTraits> YMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; - Impl::Dot::dot( - space, R_internal, X_internal, Y_internal); + Impl::Dot::dot(space, R_internal, X_internal, Y_internal); } /// \brief Compute the column-wise dot products of two multivectors. @@ -314,8 +273,7 @@ void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, /// doesn't confuse this version of dot() with the three-argument /// version of dot() in Kokkos_Blas1.hpp. template -void dot(const RV& R, const XMV& X, const YMV& Y, - typename std::enable_if::value, int>::type = 0) { +void dot(const RV& R, const XMV& X, const YMV& Y, typename std::enable_if::value, int>::type = 0) { dot(typename XMV::execution_space{}, R, X, Y); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_fill.hpp b/blas/src/KokkosBlas1_fill.hpp index 403411f7b8..486ee46c71 100644 --- a/blas/src/KokkosBlas1_fill.hpp +++ b/blas/src/KokkosBlas1_fill.hpp @@ -33,8 +33,7 @@ namespace KokkosBlas { /// \param X [out] Output View (1-D or 2-D). /// \param val [in] Value with which to fill the entries of X. template -void fill(const execution_space& space, const XMV& X, - const typename XMV::non_const_value_type& val) { +void fill(const execution_space& space, const XMV& X, const typename XMV::non_const_value_type& val) { Kokkos::Profiling::pushRegion("KokkosBlas::fill"); Kokkos::deep_copy(space, X, val); Kokkos::Profiling::popRegion(); diff --git a/blas/src/KokkosBlas1_iamax.hpp b/blas/src/KokkosBlas1_iamax.hpp index cfaaaeed63..4b69f8d507 100644 --- a/blas/src/KokkosBlas1_iamax.hpp +++ b/blas/src/KokkosBlas1_iamax.hpp @@ -36,43 +36,35 @@ namespace KokkosBlas { /// single value. /// Note: Returned index is 1-based for compatibility with Fortran. template , - int>::type = 0> -typename XVector::size_type iamax(const execution_space& space, - const XVector& x) { + typename std::enable_if, int>::type = 0> +typename XVector::size_type iamax(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::iamax: execution_space must be a valid Kokkos " "execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::iamax: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::iamax: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::iamax: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::iamax: " "Both Vector inputs must have rank 1."); typedef typename XVector::size_type index_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; index_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Iamax::iamax(space, - R, X); + Impl::Iamax::iamax(space, R, X); space.fence(); return result; } @@ -122,17 +114,13 @@ void iamax(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::iamax: XMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::iamax: XMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::iamax: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::iamax: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); @@ -146,41 +134,32 @@ void iamax(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::iamax (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV may be rank 0 or rank 1. // XMV may be rank 1 or rank 2. - typedef Kokkos::View::type, - UnifiedRVLayout, - typename std::conditional< - std::is_same::value, - Kokkos::HostSpace, typename RV::device_type>::type, - Kokkos::MemoryTraits > - RV_Internal; typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, + typename std::conditional::type, + UnifiedRVLayout, + typename std::conditional::value, + Kokkos::HostSpace, typename RV::device_type>::type, Kokkos::MemoryTraits > + RV_Internal; + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Iamax::iamax( - space, R_internal, X_internal); + Impl::Iamax::iamax(space, R_internal, X_internal); } /// \brief R(j) = iamax(X(i,j)) @@ -197,8 +176,7 @@ void iamax(const execution_space& space, const RV& R, const XMV& X, /// Note for TPL cuBLAS: When TPL cuBLAS iamax is used and returns result to a /// view, RMV must be 0-D view and XMV must be 1-D view. template -void iamax(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void iamax(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { iamax(typename XMV::execution_space{}, R, X); } diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 32ede3090c..9d76d6a822 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -41,75 +41,56 @@ namespace KokkosBlas { /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. template -void mult(const execution_space& space, typename YMV::const_value_type& gamma, - const YMV& Y, typename AV::const_value_type& alpha, const AV& A, - const XMV& X) { +void mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, + typename AV::const_value_type& alpha, const AV& A, const XMV& X) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::mult: execution_space must be a valid Kokkos " "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "Y is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: YMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: YMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "A is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: AV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: AV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::mult: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert( - (XMV::rank == 1 && YMV::rank == 1) || (XMV::rank == 2 && YMV::rank == 2), - "KokkosBlas::mult: Y and X must be either both rank 1, " - "or both rank 2."); + static_assert((XMV::rank == 1 && YMV::rank == 1) || (XMV::rank == 2 && YMV::rank == 2), + "KokkosBlas::mult: Y and X must be either both rank 1, " + "or both rank 2."); static_assert(AV::rank == 1, "KokkosBlas::mult: A must have rank 1."); // Check compatibility of dimensions at run time. - if (Y.extent(0) != A.extent(0) || Y.extent(0) != X.extent(0) || - Y.extent(1) != X.extent(1)) { + if (Y.extent(0) != A.extent(0) || Y.extent(0) != X.extent(0) || Y.extent(1) != X.extent(1)) { std::ostringstream os; os << "KokkosBlas::mult: Dimensions do not match: " - << "Y: " << Y.extent(0) << " x " << Y.extent(1) << ", A: " << A.extent(0) - << " x " << A.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "Y: " << Y.extent(0) << " x " << Y.extent(1) << ", A: " << A.extent(0) << " x " << A.extent(0) + << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using YUnifiedLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using AUnifiedLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - AV, YUnifiedLayout>::array_layout; - using XUnifiedLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - XMV, YUnifiedLayout>::array_layout; + using YUnifiedLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using AUnifiedLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; + using XUnifiedLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. - typedef Kokkos::View > YMV_Internal; - typedef Kokkos::View > AV_Internal; - typedef Kokkos::View > XMV_Internal; @@ -117,8 +98,8 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, AV_Internal A_internal = A; XMV_Internal X_internal = X; - Impl::Mult::mult( - space, gamma, Y_internal, alpha, A_internal, X_internal); + Impl::Mult::mult(space, gamma, Y_internal, alpha, + A_internal, X_internal); } /// \brief Element wise multiplication of two vectors: @@ -138,8 +119,8 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. template -void mult(typename YMV::const_value_type& gamma, const YMV& Y, - typename AV::const_value_type& alpha, const AV& A, const XMV& X) { +void mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, + const XMV& X) { mult(typename YMV::execution_space{}, gamma, Y, alpha, A, X); } diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index e9b26e6177..bf7119a585 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -33,39 +33,30 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm1 product result; a single value. -template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm1(const execution_space& space, const XVector& x) { - static_assert( - Kokkos::is_execution_space::value, - "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm1: XVector must be a Kokkos::View."); +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm1( + const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, "KokkosBlas::nrm1: " "Both Vector inputs must have rank 1."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using RVector_Internal = - Kokkos::View >; + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm1::nrm1(space, - R, X); + Impl::Nrm1::nrm1(space, R, X); space.fence(); return result; } @@ -78,9 +69,8 @@ nrm1(const execution_space& space, const XVector& x) { /// /// \return The nrm1 product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm1(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm1( + const XVector& x) { return nrm1(typename XVector::execution_space{}, x); } @@ -109,22 +99,17 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: " "X is not a Kokkos::View."); - static_assert(std::is_same::value, + static_assert(std::is_same::value, "KokkosBlas::nrm1: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm1: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm1: execution_space cannot access data in XMV"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm1: execution_space cannot access data in XMV"); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm1: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -134,37 +119,28 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm1 (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View::type, - UnifiedRVLayout, typename RV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedRVLayout, typename RV::device_type, Kokkos::MemoryTraits > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm1::nrm1( - space, R_internal, X_internal); + Impl::Nrm1::nrm1(space, R_internal, X_internal); } /// \brief R(j) = nrm1(X(i,j)) @@ -182,16 +158,14 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, /// \param R [out] Output 1-D View containing the result /// \param X [in] Input 1-D View. template -void nrm1(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm1(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrm1(typename XMV::execution_space{}, R, X); } /// \brief Return the nrm1 of the vector x via asum (the actual blas name). template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -asum(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type asum( + const XVector& x) { return nrm1(x); } diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index 59f105f5a4..2e8558ba32 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -34,46 +34,36 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. -template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2(const execution_space& space, const XVector& x) { +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2( + const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2: execution_space must be a valid" " Kokkos execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2: " "XVector must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Nrm2::nrm2( - space, R, X, true); + Impl::Nrm2::nrm2(space, R, X, true); space.fence(); return result; } @@ -89,9 +79,8 @@ nrm2(const execution_space& space, const XVector& x) { /// /// \return The nrm2 product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2( + const XVector& x) { return nrm2(typename XVector::execution_space{}, x); } @@ -122,22 +111,17 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2: X cannot be accessed from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: X cannot be accessed from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::nrm2: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -147,33 +131,26 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2 (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2( - space, R_internal, X_internal, true); + Impl::Nrm2::nrm2(space, R_internal, X_internal, true); } /// \brief R(i,j) = nrm2(X(i,j)) @@ -193,8 +170,7 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, /// \param R [out] Output View containing results (rank 0 or 1). /// \param X [in] Input View (rank 1 or 2). template -void nrm2(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm2(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrm2(typename XMV::execution_space{}, R, X); } @@ -202,14 +178,11 @@ void nrm2(const RV& R, const XMV& X, /// Serial nrm2 /// template -KOKKOS_INLINE_FUNCTION typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type +KOKKOS_INLINE_FUNCTION typename Kokkos::Details::InnerProductSpaceTraits::mag_type serial_nrm2(const XMV X) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); - static_assert(XMV::rank == 1, - "KokkosBlas::serial_nrm2: XMV must have rank 1"); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(XMV::rank == 1, "KokkosBlas::serial_nrm2: XMV must have rank 1"); #endif // KOKKOSKERNELS_DEBUG_LEVEL return Impl::serial_nrm2(X.extent(0), X.data(), X.stride_0()); @@ -219,26 +192,20 @@ template KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { // Do some compile time check when debug is enabled #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::serial_nrm2: RV is not a Kokkos::View"); - static_assert(std::is_same::value, + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, "KokkosBlas::serial_nrm2: RV is not a Kokkos::View"); + static_assert(std::is_same::value, "KokkosBlas::serial_nrm2: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::serial_nrm2: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type; - static_assert( - std::is_same::value, - "KokkosBlas::serial_nrm2: RV must have same value_type as" - " Kokkos::ArithTraits::mag_type"); + using norm_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; + static_assert(std::is_same::value, + "KokkosBlas::serial_nrm2: RV must have same value_type as" + " Kokkos::ArithTraits::mag_type"); if (R.extent(0) != X.extent(1)) { Kokkos::printf( @@ -249,8 +216,7 @@ KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { } #endif // KOKKOSKERNELS_DEBUG_LEVEL - Impl::serial_nrm2(X.extent(0), X.extent(1), X.data(), X.stride_0(), - X.stride_1(), R.data(), R.stride_0()); + Impl::serial_nrm2(X.extent(0), X.extent(1), X.data(), X.stride_0(), X.stride_1(), R.data(), R.stride_0()); return 0; } diff --git a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index c065efb290..748ece3663 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -33,46 +33,36 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. -template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2_squared(const execution_space& space, const XVector& x) { +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2_squared( + const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2_squared: execution_space must be a valid" " Kokkos execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2_squared: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2_squared: XVector must be accessible" - " from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2_squared: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm2::nrm2( - space, R, X, false); + Impl::Nrm2::nrm2(space, R, X, false); space.fence(); return result; } @@ -88,9 +78,8 @@ nrm2_squared(const execution_space& space, const XVector& x) { /// /// \return The nrm2 product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2_squared(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2_squared( + const XVector& x) { return nrm2_squared(typename XVector::execution_space{}, x); } @@ -111,9 +100,8 @@ nrm2_squared(const XVector& x) { /// \param R [in] Output View (rank 0 or 1) that holds the result. /// \param X [in] Input View (rank 1 or 2). template -void nrm2_squared( - const execution_space& space, const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm2_squared(const execution_space& space, const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2_squared: execution_space must be a valid" " Kokkos execution space"); @@ -123,22 +111,17 @@ void nrm2_squared( static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2_squared: XVector must be accessible" - " from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::nrm2_squared: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2_squared: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -148,33 +131,26 @@ void nrm2_squared( if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2 (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2( - space, R_internal, X_internal, false); + Impl::Nrm2::nrm2(space, R_internal, X_internal, false); } /// \brief R(i,j) = nrm2(X(i,j)) @@ -190,9 +166,7 @@ void nrm2_squared( /// the same rank as RMV, and its entries must be assignable to /// those of RMV. template -void nrm2_squared( - const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrm2_squared(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrm2_squared(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index c5eaa0621b..5fea0c783c 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -36,44 +36,35 @@ namespace KokkosBlas { /// /// \return The nrm2w product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w(const execution_space& space, const XVector& x, const XVector& w, - typename std::enable_if< - Kokkos::is_execution_space::value, int>::type = 0) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w( + const execution_space& space, const XVector& x, const XVector& w, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w: execution_space must be a valid" " Kokkos execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2w: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w: XVector must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w: " "Both Vector inputs must have rank 1."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; using RVector_Internal = - Kokkos::View >; + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w( - space, R, X, W, true); + Impl::Nrm2w::nrm2w(space, R, X, W, true); space.fence(); return result; } @@ -90,9 +81,8 @@ nrm2w(const execution_space& space, const XVector& x, const XVector& w, /// /// \return The nrm2w product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w(const XVector& x, const XVector& w) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w( + const XVector& x, const XVector& w) { return nrm2w(typename XVector::execution_space{}, x, w); } @@ -114,8 +104,7 @@ nrm2w(const XVector& x, const XVector& w) { /// \param X [in] Input View (rank 1 or 2). /// \param W [in] Input View (rank 1 or 2). template -void nrm2w(const execution_space& space, const RV& R, const XMV& X, - const XMV& W, +void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w: execution_space must be a valid" @@ -126,22 +115,17 @@ void nrm2w(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::nrm2w: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2w: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2w: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -151,25 +135,19 @@ void nrm2w(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2w (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; @@ -177,8 +155,7 @@ void nrm2w(const execution_space& space, const RV& R, const XMV& X, XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w( - space, R_internal, X_internal, W_internal, true); + Impl::Nrm2w::nrm2w(space, R_internal, X_internal, W_internal, true); } /// \brief R(i,j) = nrm2w(X(i,j)) diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index a1fe10bf1e..375a55c294 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -34,49 +34,38 @@ namespace KokkosBlas { /// \param w [in] Input weights (1-D View). /// /// \return The nrm2w product result; a single value. -template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w_squared(const execution_space& space, const XVector& x, - const XVector& w) { +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w_squared( + const execution_space& space, const XVector& x, const XVector& w) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w_squared: execution_space must be a valid " "Kokkos execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrm2w_squared: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w_squared: XVector must be accessible from " - "execution_space."); + static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w_squared: " "Both Vector inputs must have rank 1."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; using RVector_Internal = - Kokkos::View >; + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w( - space, R, X, W, false); + Impl::Nrm2w::nrm2w(space, R, X, W, false); space.fence(); return result; } @@ -93,9 +82,8 @@ nrm2w_squared(const execution_space& space, const XVector& x, /// /// \return The nrm2w product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrm2w_squared(const XVector& x, const XVector& w) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrm2w_squared( + const XVector& x, const XVector& w) { return nrm2w_squared(typename XVector::execution_space(), x, w); } @@ -117,9 +105,8 @@ nrm2w_squared(const XVector& x, const XVector& w) { /// \param X [in] Input View (rank 1 or 2). /// \param W [in] Input View (rank 1 or 2). template -void nrm2w_squared( - const execution_space& space, const RV& R, const XMV& X, const XMV& W, - typename std::enable_if::value, int>::type = 0) { +void nrm2w_squared(const execution_space& space, const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrm2w_squared: execution_space must be a valid " "Kokkos execution space."); @@ -129,22 +116,17 @@ void nrm2w_squared( static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w_squared: XVector must be accessible from " - "execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); + static_assert(std::is_same::value, "KokkosBlas::nrm2w_squared: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2w_squared: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits::mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2w: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -154,32 +136,25 @@ void nrm2w_squared( if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrm2w (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - using RV_Internal = Kokkos::View >; - using XMV_Internal = Kokkos::View >; RV_Internal R_internal = R; XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w( - space, R_internal, X_internal, W_internal, false); + Impl::Nrm2w::nrm2w(space, R_internal, X_internal, W_internal, false); } /// \brief R(i,j) = nrm2w(X(i,j)) @@ -199,9 +174,8 @@ void nrm2w_squared( /// \param X [in] Input View (rank 1 or 2). /// \param W [in] Input View (rank 1 or 2). template -void nrm2w_squared( - const RV& R, const XMV& X, const XMV& W, - typename std::enable_if::value, int>::type = 0) { +void nrm2w_squared(const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { nrm2w_squared(typename XMV::execution_space{}, R, X, W); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrminf.hpp b/blas/src/KokkosBlas1_nrminf.hpp index c6f923aefe..ec3a98fa95 100644 --- a/blas/src/KokkosBlas1_nrminf.hpp +++ b/blas/src/KokkosBlas1_nrminf.hpp @@ -33,39 +33,31 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrminf product result; a single value. -template < - class execution_space, class XVector, - typename std::enable_if::value, - int>::type = 0> -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrminf(const execution_space& space, const XVector& x) { - static_assert(Kokkos::is_view::value, - "KokkosBlas::nrminf: XVector must be a Kokkos::View."); +template ::value, int>::type = 0> +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrminf( + const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, "KokkosBlas::nrminf: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::NrmInf::nrminf( - space, R, X); + Impl::NrmInf::nrminf(space, R, X); space.fence(); return result; } @@ -78,9 +70,8 @@ nrminf(const execution_space& space, const XVector& x) { /// /// \return The nrminf product result; a single value. template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type -nrminf(const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type nrminf( + const XVector& x) { return nrminf(typename XVector::execution_space{}, x); } @@ -95,9 +86,8 @@ nrminf(const XVector& x) { /// the same rank as RMV, and its entries must be assignable to /// those of RMV. template -void nrminf( - const execution_space& space, const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrminf(const execution_space& space, const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, "KokkosBlas::nrminf: space is not an execution space instance"); static_assert(Kokkos::is_view::value, @@ -106,22 +96,17 @@ void nrminf( static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrminf: X is not accessible from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrminf: X is not accessible from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::nrminf: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrminf: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + typedef typename Kokkos::Details::InnerProductSpaceTraits::mag_type mag_type; static_assert(std::is_same::value, "KokkosBlas::nrminf: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -131,37 +116,28 @@ void nrminf( if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::nrminf (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View::type, - UnifiedRVLayout, typename RV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedRVLayout, typename RV::device_type, Kokkos::MemoryTraits > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - UnifiedXLayout, typename XMV::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View::type, + UnifiedXLayout, typename XMV::device_type, Kokkos::MemoryTraits > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::NrmInf::nrminf( - space, R_internal, X_internal); + Impl::NrmInf::nrminf(space, R_internal, X_internal); } /// \brief R(j) = nrminf(X(i,j)) @@ -174,9 +150,7 @@ void nrminf( /// the same rank as RMV, and its entries must be assignable to /// those of RMV. template -void nrminf( - const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void nrminf(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrminf(typename XMV::execution_space{}, R, X); } diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index ef73d26828..477c885e5e 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -47,19 +47,14 @@ void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "R is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::reciprocal: XMV must be accessible from execution_space"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: XMV must be accessible from execution_space"); + static_assert(std::is_same::value, "KokkosBlas::reciprocal: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -74,32 +69,27 @@ void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { if (X.extent(0) != R.extent(0) || X.extent(1) != R.extent(1)) { std::ostringstream os; os << "KokkosBlas::reciprocal (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1); + << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename RMV::device_type, + Kokkos::MemoryTraits > RMV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename XMV::device_type, + Kokkos::MemoryTraits > XMV_Internal; RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Reciprocal::reciprocal( - space, R_internal, X_internal); + Impl::Reciprocal::reciprocal(space, R_internal, X_internal); } /// \brief R(i,j) = reciprocal(X(i,j)) diff --git a/blas/src/KokkosBlas1_rot.hpp b/blas/src/KokkosBlas1_rot.hpp index d848617b6e..7bc3215604 100644 --- a/blas/src/KokkosBlas1_rot.hpp +++ b/blas/src/KokkosBlas1_rot.hpp @@ -22,58 +22,45 @@ namespace KokkosBlas { template -void rot(execution_space const& space, VectorView const& X, VectorView const& Y, - ScalarView const& c, ScalarView const& s) { +void rot(execution_space const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, + ScalarView const& s) { static_assert(Kokkos::is_execution_space::value, "rot: execution_space template parameter is not a Kokkos " "execution space."); - static_assert(VectorView::rank == 1, - "rot: VectorView template parameter needs to be a rank 1 view"); - static_assert(ScalarView::rank == 0, - "rot: ScalarView template parameter needs to be a rank 0 view"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rot: VectorView template parameter memory space needs to be accessible " - "from " - "execution_space template parameter"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rot: VectorView template parameter memory space needs to be accessible " - "from " - "execution_space template parameter"); - static_assert( - std::is_same::value, - "rot: VectorView template parameter needs to store non-const values"); + static_assert(VectorView::rank == 1, "rot: VectorView template parameter needs to be a rank 1 view"); + static_assert(ScalarView::rank == 0, "rot: ScalarView template parameter needs to be a rank 0 view"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rot: VectorView template parameter memory space needs to be accessible " + "from " + "execution_space template parameter"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rot: VectorView template parameter memory space needs to be accessible " + "from " + "execution_space template parameter"); + static_assert(std::is_same::value, + "rot: VectorView template parameter needs to store non-const values"); - using VectorView_Internal = Kokkos::View< - typename VectorView::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using VectorView_Internal = Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; - using ScalarView_Internal = Kokkos::View< - typename ScalarView::non_const_value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using ScalarView_Internal = Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; VectorView_Internal X_(X), Y_(Y); ScalarView_Internal c_(c), s_(s); Kokkos::Profiling::pushRegion("KokkosBlas::rot"); - Impl::Rot::rot( - space, X_, Y_, c_, s_); + Impl::Rot::rot(space, X_, Y_, c_, s_); Kokkos::Profiling::popRegion(); } template -void rot(VectorView const& X, VectorView const& Y, ScalarView const& c, - ScalarView const& s) { - const typename VectorView::execution_space space = - typename VectorView::execution_space(); +void rot(VectorView const& X, VectorView const& Y, ScalarView const& c, ScalarView const& s) { + const typename VectorView::execution_space space = typename VectorView::execution_space(); rot(space, X, Y, c, s); } diff --git a/blas/src/KokkosBlas1_rotg.hpp b/blas/src/KokkosBlas1_rotg.hpp index 3b66ae0115..1927bc2df9 100644 --- a/blas/src/KokkosBlas1_rotg.hpp +++ b/blas/src/KokkosBlas1_rotg.hpp @@ -35,40 +35,28 @@ namespace KokkosBlas { /// rotation /// \param s [out] sine value associated with the rotation template -void rotg(execution_space const& space, SViewType const& a, SViewType const& b, - MViewType const& c, SViewType const& s) { - static_assert(SViewType::rank == 0, - "rotg: the inputs need to be rank 0 views"); - static_assert(MViewType::rank == 0, - "rotg: the inputs need to be rank 0 views"); - static_assert( - !Kokkos::ArithTraits::is_complex); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotg: execution_space cannot access data in SViewType"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotg: execution_space cannot access data in MViewType"); +void rotg(execution_space const& space, SViewType const& a, SViewType const& b, MViewType const& c, + SViewType const& s) { + static_assert(SViewType::rank == 0, "rotg: the inputs need to be rank 0 views"); + static_assert(MViewType::rank == 0, "rotg: the inputs need to be rank 0 views"); + static_assert(!Kokkos::ArithTraits::is_complex); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotg: execution_space cannot access data in SViewType"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotg: execution_space cannot access data in MViewType"); using SView_Internal = Kokkos::View< - typename SViewType::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + typename SViewType::value_type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits>; using MView_Internal = Kokkos::View< - typename MViewType::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + typename MViewType::value_type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits>; SView_Internal a_(a), b_(b), s_(s); MView_Internal c_(c); Kokkos::Profiling::pushRegion("KokkosBlas::rotg"); - Impl::Rotg::rotg(space, a, b, - c, s); + Impl::Rotg::rotg(space, a, b, c, s); Kokkos::Profiling::popRegion(); } diff --git a/blas/src/KokkosBlas1_rotm.hpp b/blas/src/KokkosBlas1_rotm.hpp index 077d3350fe..6f5442e931 100644 --- a/blas/src/KokkosBlas1_rotm.hpp +++ b/blas/src/KokkosBlas1_rotm.hpp @@ -36,62 +36,45 @@ namespace KokkosBlas { /// \param param [in] output of rotmg contains rotation coefficients /// template -void rotm(execution_space const& space, VectorView const& X, - VectorView const& Y, ParamView const& param) { +void rotm(execution_space const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { static_assert(Kokkos::is_execution_space::value, "rotm: execution_space template parameter is not a Kokkos " "execution space."); - static_assert( - VectorView::rank == 1, - "rotm: VectorView template parameter needs to be a rank 1 view"); - static_assert(ParamView::rank == 1, - "rotm: ParamView template parameter needs to be a rank 1 view"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotm: VectorView template parameter memory space needs to be accessible " - "from execution_space template parameter"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotm: ScalarView template parameter memory space needs to be accessible " - "from execution_space template parameter"); - static_assert( - std::is_same::value, - "rotm: VectorView template parameter needs to store non-const values"); - static_assert( - !Kokkos::ArithTraits::is_complex, - "rotm: VectorView template parameter cannot use complex value_type"); - static_assert( - !Kokkos::ArithTraits::is_complex, - "rotm: ParamView template parameter cannot use complex value_type"); + static_assert(VectorView::rank == 1, "rotm: VectorView template parameter needs to be a rank 1 view"); + static_assert(ParamView::rank == 1, "rotm: ParamView template parameter needs to be a rank 1 view"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotm: VectorView template parameter memory space needs to be accessible " + "from execution_space template parameter"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotm: ScalarView template parameter memory space needs to be accessible " + "from execution_space template parameter"); + static_assert(std::is_same::value, + "rotm: VectorView template parameter needs to store non-const values"); + static_assert(!Kokkos::ArithTraits::is_complex, + "rotm: VectorView template parameter cannot use complex value_type"); + static_assert(!Kokkos::ArithTraits::is_complex, + "rotm: ParamView template parameter cannot use complex value_type"); - using VectorView_Internal = Kokkos::View< - typename VectorView::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using VectorView_Internal = Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; using ParamView_Internal = Kokkos::View< - typename ParamView::const_value_type[5], - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + typename ParamView::const_value_type[5], typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits>; VectorView_Internal X_(X), Y_(Y); ParamView_Internal param_(param); Kokkos::Profiling::pushRegion("KokkosBlas::rotm"); - Impl::Rotm::rotm( - space, X_, Y_, param_); + Impl::Rotm::rotm(space, X_, Y_, param_); Kokkos::Profiling::popRegion(); } template void rotm(VectorView const& X, VectorView const& Y, ParamView const& param) { - const typename VectorView::execution_space space = - typename VectorView::execution_space(); + const typename VectorView::execution_space space = typename VectorView::execution_space(); rotm(space, X, Y, param); } diff --git a/blas/src/KokkosBlas1_rotmg.hpp b/blas/src/KokkosBlas1_rotmg.hpp index 723b0eac1a..a6c629f987 100644 --- a/blas/src/KokkosBlas1_rotmg.hpp +++ b/blas/src/KokkosBlas1_rotmg.hpp @@ -39,46 +39,39 @@ namespace KokkosBlas { /// \param param [out] /// template -void rotmg(execution_space const& space, DXView const& d1, DXView const& d2, - DXView const& x1, YView const& y1, PView const& param) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "rotmg: execution_space cannot access data in DXView"); +void rotmg(execution_space const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, + PView const& param) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "rotmg: execution_space cannot access data in DXView"); - using DXView_Internal = Kokkos::View< - typename DXView::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using DXView_Internal = + Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; - using YView_Internal = Kokkos::View< - typename YView::value_type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using YView_Internal = + Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; - using PView_Internal = Kokkos::View< - typename PView::value_type[5], - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits>; + using PView_Internal = + Kokkos::View::array_layout, + Kokkos::Device, + Kokkos::MemoryTraits>; DXView_Internal d1_(d1), d2_(d2), x1_(x1); YView_Internal y1_(y1); PView_Internal param_(param); Kokkos::Profiling::pushRegion("KokkosBlas::rotmg"); - Impl::Rotmg::rotmg(space, d1_, d2_, x1_, y1_, param_); + Impl::Rotmg::rotmg(space, d1_, d2_, x1_, y1_, + param_); Kokkos::Profiling::popRegion(); } template -void rotmg(DXView const& d1, DXView const& d2, DXView const& x1, - YView const& y1, PView const& param) { - const typename PView::execution_space space = - typename PView::execution_space(); +void rotmg(DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, PView const& param) { + const typename PView::execution_space space = typename PView::execution_space(); rotmg(space, d1, d2, x1, y1, param); } diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 39c197f352..561c505035 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -44,31 +44,23 @@ namespace KokkosBlas { /// \param a [in] view of type AV, scaling parameter for X. /// \param X [in] input view of type XMV. template -void scal(const execution_space& space, const RMV& R, const AV& a, - const XMV& X) { +void scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::scal: execution_space must be a valid Kokkos " "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "R is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::scal: RMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: RMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::scal: XMV must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::scal: XMV must be assignable to RMV"); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: XMV must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::scal: XMV must be assignable to RMV"); + static_assert(std::is_same::value, "KokkosBlas::scal: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -83,36 +75,27 @@ void scal(const execution_space& space, const RMV& R, const AV& a, if (X.extent(0) != R.extent(0) || X.extent(1) != R.extent(1)) { std::ostringstream os; os << "KokkosBlas::scal: Dimensions of R and X do not match: " - << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1); + << "R: " << R.extent(0) << " x " << R.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedRLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - XMV, UnifiedRLayout>::array_layout; + using UnifiedRLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. AV may be either a rank-1 View, or a scalar // value. - using RMV_Internal = Kokkos::View >; - using XMV_Internal = Kokkos::View >; - using AV_Internal = - typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; + using AV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; RMV_Internal R_internal = R; AV_Internal a_internal = a; XMV_Internal X_internal = X; - Impl::Scal::scal( - space, R_internal, a_internal, X_internal); + Impl::Scal::scal(space, R_internal, a_internal, X_internal); } /// \brief Computes R := alpha*X @@ -140,10 +123,8 @@ void scal(const RMV& R, const AV& a, const XMV& X) { struct SerialScale { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType& A) { - return Impl::SerialScaleInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType& A) { + return Impl::SerialScaleInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -154,11 +135,8 @@ struct SerialScale { template struct TeamScale { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const ScalarType alpha, - const AViewType& A) { - return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A) { + return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -170,12 +148,9 @@ struct TeamScale { template struct TeamVectorScale { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const ScalarType alpha, - const AViewType& A) { - return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), - A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const ScalarType alpha, const AViewType& A) { + return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1()); } }; diff --git a/blas/src/KokkosBlas1_set.hpp b/blas/src/KokkosBlas1_set.hpp index ea31ff6282..6a6a5e0f22 100644 --- a/blas/src/KokkosBlas1_set.hpp +++ b/blas/src/KokkosBlas1_set.hpp @@ -27,10 +27,8 @@ namespace KokkosBlas { struct SerialSet { template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A) { - return Impl::SerialSetInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A) { + return Impl::SerialSetInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -41,12 +39,8 @@ struct SerialSet { template struct TeamSet { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - return Impl::TeamSetInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), - A.stride_1()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + return Impl::TeamSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; @@ -57,11 +51,8 @@ struct TeamSet { template struct TeamVectorSet { template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - return Impl::TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { + return Impl::TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); } }; diff --git a/blas/src/KokkosBlas1_sum.hpp b/blas/src/KokkosBlas1_sum.hpp index 88c7b10021..dffd19382e 100644 --- a/blas/src/KokkosBlas1_sum.hpp +++ b/blas/src/KokkosBlas1_sum.hpp @@ -33,40 +33,32 @@ namespace KokkosBlas { /// /// \return The sum product result; a single value. template , - int>::type = 0> -typename XVector::non_const_value_type sum(const execution_space& space, - const XVector& x) { + typename std::enable_if, int>::type = 0> +typename XVector::non_const_value_type sum(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::sum: execution_space must be a valid Kokkos " "execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::sum: XVector must be a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::sum: XVector must be accessible from execution_space."); + static_assert(Kokkos::is_view::value, "KokkosBlas::sum: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XVector must be accessible from execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::sum: " "Both Vector inputs must have rank 1."); - using XVector_Internal = Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; - using RVector_Internal = - Kokkos::View >; + using RVector_Internal = Kokkos::View >; typename XVector::non_const_value_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Sum::sum(space, R, - X); + Impl::Sum::sum(space, R, X); space.fence(); return result; } @@ -113,17 +105,13 @@ void sum(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::sum: " "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::sum: XMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::sum: R is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert(((RV::rank == 0) && (XMV::rank == 1)) || - ((RV::rank == 1) && (XMV::rank == 2)), + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::sum: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); @@ -131,33 +119,26 @@ void sum(const execution_space& space, const RV& R, const XMV& X, if (X.extent(1) != R.extent(0)) { std::ostringstream os; os << "KokkosBlas::sum (MV): Dimensions of R and X do not match: " - << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " - << X.extent(1); + << "R: " << R.extent(0) << ", X: " << X.extent(0) << " x " << X.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedRVLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + using UnifiedXLayout = typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View > RV_Internal; - typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Sum::sum(space, R_internal, - X_internal); + Impl::Sum::sum(space, R_internal, X_internal); } /// \brief R(j) = sum(X(i,j)) @@ -176,8 +157,7 @@ void sum(const execution_space& space, const RV& R, const XMV& X, /// \param R [out] Output View (rank 0 or 1) containing the results. /// \param X [in] Input View (rank 1 or 2). template -void sum(const RV& R, const XMV& X, - typename std::enable_if::value, int>::type = 0) { +void sum(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { sum(typename XMV::execution_space{}, R, X); } diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index 9ddcd106df..30155f5d44 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -42,44 +42,32 @@ namespace KokkosBlas { template void swap(execution_space const& space, XVector const& x, YVector const& y) { // Assert properties of XVector - static_assert(Kokkos::is_view::value, - "KokkosBlas::swap: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::swap: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, "KokkosBlas::swap: " "Input vector x must have rank 1."); - static_assert(std::is_same_v, + static_assert(std::is_same_v, "KokkosBlas::swap: XVector must store non const values."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "swap: execution_space cannot access data in XVector"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "swap: execution_space cannot access data in XVector"); // Assert properties of YVector, could probably use a function for this as // XVector and YVector are required to have identical properties... - static_assert(Kokkos::is_view::value, - "KokkosBlas::swap: YVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::swap: YVector must be a Kokkos::View."); static_assert(YVector::rank == 1, "KokkosBlas::swap: " "Input vector y must have rank 1."); - static_assert(std::is_same_v, + static_assert(std::is_same_v, "KokkosBlas::swap: YVector must store non const values."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "swap: execution_space cannot access data in YVector"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "swap: execution_space cannot access data in YVector"); using XVector_Internal = Kokkos::View< - typename XVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits >; + typename XVector::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits >; using YVector_Internal = Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - Kokkos::Device, - Kokkos::MemoryTraits >; + typename YVector::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + Kokkos::Device, Kokkos::MemoryTraits >; XVector_Internal X(x); YVector_Internal Y(y); @@ -92,8 +80,7 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { Kokkos::Profiling::pushRegion("KokkosBlas::swap"); // If X.extent(0) == 0, do nothing if (X.extent(0) != 0) { - Impl::Swap::swap(space, - X, Y); + Impl::Swap::swap(space, X, Y); } Kokkos::Profiling::popRegion(); } @@ -111,8 +98,7 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { /// executed on the default stream of the execution_space associted with x. template void swap(const XVector& x, const YVector& y) { - const typename XVector::execution_space space = - typename XVector::execution_space(); + const typename XVector::execution_space space = typename XVector::execution_space(); swap(space, x, y); } diff --git a/blas/src/KokkosBlas1_team_abs.hpp b/blas/src/KokkosBlas1_team_abs.hpp index 55dcc668db..a7e808c713 100644 --- a/blas/src/KokkosBlas1_team_abs.hpp +++ b/blas/src/KokkosBlas1_team_abs.hpp @@ -23,8 +23,7 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION abs(const TeamType& team, const RVector& r, - const XVector& x) { +void KOKKOS_INLINE_FUNCTION abs(const TeamType& team, const RVector& r, const XVector& x) { Impl::TeamAbs::team_abs(team, r, x); } diff --git a/blas/src/KokkosBlas1_team_axpby.hpp b/blas/src/KokkosBlas1_team_axpby.hpp index 374bc42390..1b8734a852 100644 --- a/blas/src/KokkosBlas1_team_axpby.hpp +++ b/blas/src/KokkosBlas1_team_axpby.hpp @@ -23,21 +23,16 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -axpby(const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const typename YVector::non_const_value_type& b, - const YVector& y) { - return Impl::TeamAXPBY::team_axpby(team, a, x, b, - y); +void KOKKOS_INLINE_FUNCTION axpby(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const typename YVector::non_const_value_type& b, const YVector& y) { + return Impl::TeamAXPBY::team_axpby(team, a, x, b, y); } template -void KOKKOS_INLINE_FUNCTION -axpy(const TeamType& team, const typename XVector::non_const_value_type& a, - const XVector& x, const YVector& y) { +void KOKKOS_INLINE_FUNCTION axpy(const TeamType& team, const typename XVector::non_const_value_type& a, + const XVector& x, const YVector& y) { KokkosBlas::Experimental::axpby( - team, a, x, - Kokkos::ArithTraits::one(), y); + team, a, x, Kokkos::ArithTraits::one(), y); } } // namespace Experimental diff --git a/blas/src/KokkosBlas1_team_dot.hpp b/blas/src/KokkosBlas1_team_dot.hpp index 25c5c05cfc..53065b6fae 100644 --- a/blas/src/KokkosBlas1_team_dot.hpp +++ b/blas/src/KokkosBlas1_team_dot.hpp @@ -23,9 +23,9 @@ namespace KokkosBlas { namespace Experimental { template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::dot_type KOKKOS_INLINE_FUNCTION -dot(const TeamType& team, const XVector& x, const YVector& y) { +typename Kokkos::Details::InnerProductSpaceTraits::dot_type + KOKKOS_INLINE_FUNCTION + dot(const TeamType& team, const XVector& x, const YVector& y) { return Impl::TeamDot::team_dot(team, x, y); } diff --git a/blas/src/KokkosBlas1_team_mult.hpp b/blas/src/KokkosBlas1_team_mult.hpp index 2737f835c0..08d9c6813e 100644 --- a/blas/src/KokkosBlas1_team_mult.hpp +++ b/blas/src/KokkosBlas1_team_mult.hpp @@ -23,12 +23,10 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -mult(const TeamType& team, const typename YVector::non_const_value_type& gamma, - const YVector& y, const typename AVector::non_const_value_type& alpha, - const AVector& a, const XVector& x) { - return Impl::TeamMult::team_mult( - team, gamma, y, alpha, a, x); +void KOKKOS_INLINE_FUNCTION mult(const TeamType& team, const typename YVector::non_const_value_type& gamma, + const YVector& y, const typename AVector::non_const_value_type& alpha, + const AVector& a, const XVector& x) { + return Impl::TeamMult::team_mult(team, gamma, y, alpha, a, x); } } // namespace Experimental diff --git a/blas/src/KokkosBlas1_team_nrm2.hpp b/blas/src/KokkosBlas1_team_nrm2.hpp index ee58cd3331..f0ac33f4f2 100644 --- a/blas/src/KokkosBlas1_team_nrm2.hpp +++ b/blas/src/KokkosBlas1_team_nrm2.hpp @@ -23,9 +23,9 @@ namespace KokkosBlas { namespace Experimental { template -typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type KOKKOS_INLINE_FUNCTION -nrm2(const TeamType& team, const XVector& x) { +typename Kokkos::Details::InnerProductSpaceTraits::mag_type + KOKKOS_INLINE_FUNCTION + nrm2(const TeamType& team, const XVector& x) { return Impl::TeamNrm2::team_nrm2(team, x); } diff --git a/blas/src/KokkosBlas1_team_scal.hpp b/blas/src/KokkosBlas1_team_scal.hpp index b148e165f1..31d0c63b6d 100644 --- a/blas/src/KokkosBlas1_team_scal.hpp +++ b/blas/src/KokkosBlas1_team_scal.hpp @@ -23,9 +23,8 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -scal(const TeamType& team, const RVector& r, - const typename XVector::non_const_value_type& a, const XVector& x) { +void KOKKOS_INLINE_FUNCTION scal(const TeamType& team, const RVector& r, + const typename XVector::non_const_value_type& a, const XVector& x) { return Impl::TeamScal::team_scal(team, r, a, x); } diff --git a/blas/src/KokkosBlas1_team_update.hpp b/blas/src/KokkosBlas1_team_update.hpp index 069932b1e5..587c492c6e 100644 --- a/blas/src/KokkosBlas1_team_update.hpp +++ b/blas/src/KokkosBlas1_team_update.hpp @@ -23,13 +23,11 @@ namespace KokkosBlas { namespace Experimental { template -void KOKKOS_INLINE_FUNCTION -update(const TeamType& team, - const typename XVector::non_const_value_type& alpha, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y, - const typename ZVector::non_const_value_type& gamma, const ZVector& z) { - return Impl::TeamUpdate::team_update( - team, alpha, x, beta, y, gamma, z); +void KOKKOS_INLINE_FUNCTION update(const TeamType& team, const typename XVector::non_const_value_type& alpha, + const XVector& x, const typename YVector::non_const_value_type& beta, + const YVector& y, const typename ZVector::non_const_value_type& gamma, + const ZVector& z) { + return Impl::TeamUpdate::team_update(team, alpha, x, beta, y, gamma, z); } } // namespace Experimental diff --git a/blas/src/KokkosBlas1_update.hpp b/blas/src/KokkosBlas1_update.hpp index 889f9ede32..95d1a2d7e0 100644 --- a/blas/src/KokkosBlas1_update.hpp +++ b/blas/src/KokkosBlas1_update.hpp @@ -44,8 +44,7 @@ namespace KokkosBlas { /// \param gamma [in] scaling parameter for Z /// \param Z [in/out] view of type ZMV in which the results will be stored. template -void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, const XMV& X, +void update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { static_assert(Kokkos::is_execution_space_v, @@ -60,20 +59,13 @@ void update(const execution_space& space, static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "Z is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: XMV must be accessible from execution_space."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: YMV must be accessible from execution_space."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: ZMV must be accessible from execution_space."); - static_assert(std::is_same::value, + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: XMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: YMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: ZMV must be accessible from execution_space."); + static_assert(std::is_same::value, "KokkosBlas::update: Z is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); @@ -88,37 +80,32 @@ void update(const execution_space& space, "XMV, YMV, and ZMV must either have rank 1 or rank 2."); // Check compatibility of dimensions at run time. - if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1) || - X.extent(0) != Z.extent(0) || X.extent(1) != Z.extent(1)) { + if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1) || X.extent(0) != Z.extent(0) || + X.extent(1) != Z.extent(1)) { std::ostringstream os; os << "KokkosBlas::update (MV): Dimensions of X, Y, and Z do not match: " - << "Z: " << Z.extent(0) << " x " << Z.extent(1) << ", X: " << X.extent(0) - << " x " << X.extent(1) << ", Y: " << Y.extent(0) << " x " - << Y.extent(1); + << "Z: " << Z.extent(0) << " x " << Z.extent(1) << ", X: " << X.extent(0) << " x " << X.extent(1) + << ", Y: " << Y.extent(0) << " x " << Y.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create unmanaged versions of the input Views. XMV, YMV, and ZMV // may be rank 1 or rank 2, but they must all have the same rank. - using XMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits >; + using XMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XMV::device_type, Kokkos::MemoryTraits >; - using YMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YMV::device_type, Kokkos::MemoryTraits >; + using YMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YMV::device_type, Kokkos::MemoryTraits >; - using ZMV_Internal = Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename ZMV::device_type, Kokkos::MemoryTraits >; + using ZMV_Internal = Kokkos::View::type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename ZMV::device_type, Kokkos::MemoryTraits >; XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; @@ -134,9 +121,8 @@ void update(const execution_space& space, << endl; #endif // KOKKOSKERNELS_PRINT_DEMANGLED_TYPE_INFO - Impl::Update::update(space, alpha, X_internal, beta, Y_internal, - gamma, Z_internal); + Impl::Update::update(space, alpha, X_internal, beta, + Y_internal, gamma, Z_internal); } /// \brief Compute Z := alpha*X + beta*Y + gamma*Z. diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 88ffc63810..22d2b7bbbf 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -49,56 +49,39 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View -template -void gemv(const ExecutionSpace& space, const char trans[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const XViewType& x, typename YViewType::const_value_type& beta, - const YViewType& y) { +template +void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { static_assert(Kokkos::is_execution_space_v, "KokkosBlas::gemv: ExecutionSpace must be a valid Kokkos " "execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemv: AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemv: XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemv: YViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "KokkosBlas::gemv: AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "KokkosBlas::gemv: XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "KokkosBlas::gemv: YViewType must have rank 1."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: AViewType must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: XViewType must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: YViewType must be accessible from ExecutionSpace"); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: YViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "KokkosBlas::gemv: AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "KokkosBlas::gemv: XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "KokkosBlas::gemv: YViewType must have rank 1."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: AViewType must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: XViewType must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: YViewType must be accessible from ExecutionSpace"); // Check compatibility of dimensions at run time. if (trans[0] == 'N' || trans[0] == 'n') { if (A.extent(0) != y.extent(0) || A.extent(1) != x.extent(0)) { std::ostringstream os; os << "KokkosBlas::gemv: Dimensions of A, x, and y do not match: " - << "A: " << A.extent(0) << " x " << A.extent(1) - << ", x: " << x.extent(0) << ", y: " << y.extent(0); + << "A: " << A.extent(0) << " x " << A.extent(1) << ", x: " << x.extent(0) << ", y: " << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - } else if (trans[0] == 'T' || trans[0] == 't' || trans[0] == 'C' || - trans[0] == 'c' || trans[0] == 'H' || trans[0] == 'h') { + } else if (trans[0] == 'T' || trans[0] == 't' || trans[0] == 'C' || trans[0] == 'c' || trans[0] == 'H' || + trans[0] == 'h') { if (A.extent(1) != y.extent(0) || A.extent(0) != x.extent(0)) { std::ostringstream os; os << "KokkosBlas::dot: Dimensions of A, x, and y do not match: " - << "A: " << A.extent(0) << " x " << A.extent(1) - << ", x: " << x.extent(0) << ", y: " << y.extent(0); + << "A: " << A.extent(0) << " x " << A.extent(1) << ", x: " << x.extent(0) << ", y: " << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } } else { @@ -115,21 +98,16 @@ void gemv(const ExecutionSpace& space, const char trans[], // Minimize the number of Impl::GEMV instantiations, by // standardizing on particular View specializations for its template // parameters. - typedef Kokkos::View > AVT; typedef Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits > XVT; typedef Kokkos::View::array_layout, - typename YViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename YViewType::device_type, Kokkos::MemoryTraits > YVT; // Degenerate case is essentially same as scal - use fallback impl @@ -139,43 +117,32 @@ void gemv(const ExecutionSpace& space, const char trans[], // If A is LayoutRight and we have the BLAS, cuBLAS or rocBLAS TPL, use // fallback because those only support LayoutLeft #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS - useFallback = useFallback || (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback = useFallback || + (tolower(*trans) == 'c' && std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS - useFallback = - useFallback || - (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback = useFallback || + (tolower(*trans) == 'c' && std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - useFallback = useFallback || (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback = useFallback || + (tolower(*trans) == 'c' && std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #ifdef KOKKOS_ENABLE_SYCL // oneMKL supports both row-major and column-major of A // but only supports oneapi::mkl::transpose::nontrans op useFallback = - useFallback || !std::is_same_v; + useFallback || !std::is_same_v; #endif #endif if (useFallback) { - const bool eti_spec_avail = - KokkosBlas::Impl::gemv_eti_spec_avail::value; - typedef Impl::GEMV - fallback_impl_type; + const bool eti_spec_avail = KokkosBlas::Impl::gemv_eti_spec_avail::value; + typedef Impl::GEMV fallback_impl_type; fallback_impl_type::gemv(space, trans, alpha, A, x, beta, y); } else { typedef Impl::GEMV impl_type; @@ -200,8 +167,7 @@ void gemv(const ExecutionSpace& space, const char trans[], /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View template -void gemv(const char trans[], typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, +void gemv(const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { gemv(typename AViewType::execution_space{}, trans, alpha, A, x, beta, y); } @@ -212,46 +178,38 @@ namespace Experimental { /// template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& member, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y); + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y); }; template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& /*member*/, const char trans, - const ScalarType& alpha, const MatrixType& A, const XVector& x, - const ScalarType& beta, const YVector& y) { + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& /*member*/, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { serial_gemv(trans, alpha, A, x, beta, y); } }; template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& member, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y) { + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { team_gemv(member, trans, alpha, A, x, beta, y); } }; template struct Gemv { - template - static void KOKKOS_INLINE_FUNCTION - invoke(const MemberType& member, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y) { + template + static void KOKKOS_INLINE_FUNCTION invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { teamvector_gemv(member, trans, alpha, A, x, beta, y); } }; diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp index 8650577faf..88786649ba 100644 --- a/blas/src/KokkosBlas2_ger.hpp +++ b/blas/src/KokkosBlas2_ger.hpp @@ -39,54 +39,38 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View -template -void ger(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "YViewType memory space must be accessible from ExecutionSpace"); - - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); +template +void ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); // Check compatibility of dimensions at run time. if ((A.extent(0) != x.extent(0)) || (A.extent(1) != y.extent(0))) { std::ostringstream os; os << "KokkosBlas::ger: Dimensions of A, x, and y do not match: " - << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " - << x.extent(0) << ", y has size " << y.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " << x.extent(0) << ", y has size " + << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || - (trans[0] == 'h')) { + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || (trans[0] == 'h')) { // Ok } else { std::ostringstream os; - os << "KokkosBlas::ger: invalid trans[0] = '" << trans[0] - << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + os << "KokkosBlas::ger: invalid trans[0] = '" << trans[0] << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -99,21 +83,16 @@ void ger(const ExecutionSpace& space, const char trans[], // Minimize the number of Impl::GER instantiations, by standardizing // on particular View specializations for its template parameters. typedef Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits > XVT; typedef Kokkos::View::array_layout, - typename YViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename YViewType::device_type, Kokkos::MemoryTraits > YVT; - typedef Kokkos::View > AVT; @@ -133,12 +112,10 @@ void ger(const ExecutionSpace& space, const char trans[], /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void ger(const char trans[], const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - ger( - space, trans, alpha, x, y, A); +void ger(const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) { + const typename AViewType::execution_space space = typename AViewType::execution_space(); + ger(space, trans, alpha, x, y, A); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas2_serial_gemv.hpp b/blas/src/KokkosBlas2_serial_gemv.hpp index 12dbf61c3a..2b52d6c5a9 100644 --- a/blas/src/KokkosBlas2_serial_gemv.hpp +++ b/blas/src/KokkosBlas2_serial_gemv.hpp @@ -23,13 +23,9 @@ namespace KokkosBlas { namespace Experimental { -template -void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, const ScalarType& alpha, const MatrixType& A, + const XVector& x, const ScalarType& beta, const YVector& y) { if (trans == 'N' || trans == 'n') { using mode = KokkosBlas::Trans::NoTranspose; KokkosBlas::SerialGemv::invoke(alpha, A, x, beta, y); @@ -46,11 +42,8 @@ void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, // default AlgoTag template -void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, const ScalarType& alpha, const MatrixType& A, + const XVector& x, const ScalarType& beta, const YVector& y) { serial_gemv(trans, alpha, A, x, beta, y); } diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp index 00d1d8b3de..7cb226fd7f 100644 --- a/blas/src/KokkosBlas2_syr.hpp +++ b/blas/src/KokkosBlas2_syr.hpp @@ -64,53 +64,39 @@ namespace KokkosBlas { /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template void syr(const ExecutionSpace& space, const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, - const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be accessible from ExecutionSpace"); - - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); + const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); // Check compatibility of dimensions at run time. if ((A.extent(0) != x.extent(0)) || (A.extent(1) != x.extent(0))) { std::ostringstream os; os << "KokkosBlas::syr: Dimensions of A, x: " - << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " - << x.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " << x.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || - (trans[0] == 'h')) { + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || (trans[0] == 'h')) { // Ok } else { std::ostringstream os; - os << "KokkosBlas2::syr(): invalid trans[0] = '" << trans[0] - << "'. It must be equal to 'T' or 't' or 'H' or 'h'"; + os << "KokkosBlas2::syr(): invalid trans[0] = '" << trans[0] << "'. It must be equal to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l')) { + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l')) { // Ok } else { std::ostringstream oss; - oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] - << "'. It must be equal to 'U' or 'u' or 'L' or 'l'"; + oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] << "'. It must be equal to 'U' or 'u' or 'L' or 'l'"; throw std::runtime_error(oss.str()); } @@ -122,15 +108,11 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], // Minimize the number of Impl::SYR instantiations, by standardizing // on particular View specializations for its template parameters. - using XVT = - Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits >; - - using AVT = Kokkos::View::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits >; + + using AVT = Kokkos::View >; Impl::SYR::syr(space, trans, uplo, alpha, x, A); @@ -172,13 +154,10 @@ void syr(const ExecutionSpace& space, const char trans[], const char uplo[], /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void syr(const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, +void syr(const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - syr( - space, trans, uplo, alpha, x, A); + const typename AViewType::execution_space space = typename AViewType::execution_space(); + syr(space, trans, uplo, alpha, x, A); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas2_syr2.hpp b/blas/src/KokkosBlas2_syr2.hpp index d86abd31c1..91f4b20dee 100644 --- a/blas/src/KokkosBlas2_syr2.hpp +++ b/blas/src/KokkosBlas2_syr2.hpp @@ -78,67 +78,49 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View -template +template void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, - const YViewType& y, const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "YViewType memory space must be accessible from ExecutionSpace"); - - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); - - static_assert(static_cast(AViewType::rank()) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(XViewType::rank()) == 1, - "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank()) == 1, - "YViewType must have rank 1."); + const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank()) == 2, "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank()) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank()) == 1, "YViewType must have rank 1."); // Check compatibility of dimensions at run time. - if ((A.extent(0) == A.extent(1)) && (A.extent(0) == x.extent(0)) && - (A.extent(0) == y.extent(0))) { + if ((A.extent(0) == A.extent(1)) && (A.extent(0) == x.extent(0)) && (A.extent(0) == y.extent(0))) { // Ok } else { std::ostringstream os; os << "KokkosBlas::syr2: Dimensions of A, x: " - << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " - << x.extent(0) << ", y has size " << y.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " << x.extent(0) << ", y has size " + << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || - (trans[0] == 'h')) { + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || (trans[0] == 'h')) { // Ok } else { std::ostringstream os; - os << "KokkosBlas2::syr2(): invalid trans[0] = '" << trans[0] - << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + os << "KokkosBlas2::syr2(): invalid trans[0] = '" << trans[0] << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l')) { + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l')) { // Ok } else { std::ostringstream oss; - oss << "KokkosBlas2::syr2(): invalid uplo[0] = " << uplo[0] - << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; + oss << "KokkosBlas2::syr2(): invalid uplo[0] = " << uplo[0] << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; throw std::runtime_error(oss.str()); } @@ -151,26 +133,20 @@ void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], // Minimize the number of Impl::SYR2 instantiations, by standardizing // on particular View specializations for its template parameters. typedef Kokkos::View::array_layout, - typename XViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename XViewType::device_type, Kokkos::MemoryTraits > XVT; typedef Kokkos::View::array_layout, - typename YViewType::device_type, - Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename YViewType::device_type, Kokkos::MemoryTraits > YVT; - typedef Kokkos::View > AVT; - Impl::SYR2::syr2(space, trans, uplo, alpha, x, - y, A); + Impl::SYR2::syr2(space, trans, uplo, alpha, x, y, A); } /// \brief Rank-1 update (just lower portion or just upper portion) of a @@ -224,13 +200,10 @@ void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void syr2(const char trans[], const char uplo[], - const typename AViewType::const_value_type& alpha, const XViewType& x, +void syr2(const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - syr2( - space, trans, uplo, alpha, x, y, A); + const typename AViewType::execution_space space = typename AViewType::execution_space(); + syr2(space, trans, uplo, alpha, x, y, A); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas2_team_gemv.hpp b/blas/src/KokkosBlas2_team_gemv.hpp index 09a1ae2330..a4a6dade2d 100644 --- a/blas/src/KokkosBlas2_team_gemv.hpp +++ b/blas/src/KokkosBlas2_team_gemv.hpp @@ -22,67 +22,48 @@ namespace KokkosBlas { namespace Experimental { -template -void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, const YVector& y) { if (trans == 'N' || trans == 'n') - TeamGemv::invoke(team, alpha, A, x, - beta, y); + TeamGemv::invoke(team, alpha, A, x, beta, y); else if (trans == 'T' || trans == 't') - TeamGemv::invoke(team, alpha, A, x, - beta, y); + TeamGemv::invoke(team, alpha, A, x, beta, y); else if (trans == 'C' || trans == 'c') - TeamGemv::invoke(team, alpha, A, x, - beta, y); + TeamGemv::invoke(team, alpha, A, x, beta, y); else { Kokkos::abort("Matrix mode not supported"); } } // default AlgoTag -template -void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, - const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, - const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, const YVector& y) { team_gemv(team, trans, alpha, A, x, beta, y); } -template -void KOKKOS_INLINE_FUNCTION -teamvector_gemv(const TeamType& team, const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, const ScalarType& beta, - const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION teamvector_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { if (trans == 'N' || trans == 'n') { - KokkosBlas::TeamVectorGemv::invoke( - team, alpha, A, x, beta, y); + KokkosBlas::TeamVectorGemv::invoke(team, alpha, A, x, beta, y); } else if (trans == 'T' || trans == 't') { - KokkosBlas::TeamVectorGemv::invoke( - team, alpha, A, x, beta, y); + KokkosBlas::TeamVectorGemv::invoke(team, alpha, A, x, beta, y); } else if (trans == 'C' || trans == 'c') { - KokkosBlas::TeamVectorGemv::invoke( - team, alpha, A, x, beta, y); + KokkosBlas::TeamVectorGemv::invoke(team, alpha, A, x, beta, y); } else { Kokkos::abort("Matrix mode not supported"); } } // default AlgoTag -template -void KOKKOS_INLINE_FUNCTION -team_vector_gemv(const TeamType& team, const char trans, - const ScalarType& alpha, const MatrixType& A, const XVector& x, - const ScalarType& beta, const YVector& y) { - teamvector_gemv(team, trans, alpha, A, x, - beta, y); +template +void KOKKOS_INLINE_FUNCTION team_vector_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { + teamvector_gemv(team, trans, alpha, A, x, beta, y); } } // namespace Experimental diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index febd39b149..b0bff7ea71 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -38,31 +38,21 @@ namespace Impl { // This case must be intercepted here rather than impl in order to call TPL // GEMV instead of TPL GEMM. This codepath was measured to be profitable with // cuBLAS. -template +template bool gemv_based_gemm( - const execution_space& space, const char transA[], const char transB[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B, typename CViewType::const_value_type& beta, - const CViewType& C, - typename std::enable_if::value && - !std::is_same::value>::type* = + const execution_space& space, const char transA[], const char transB[], typename AViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C, + typename std::enable_if::value && + !std::is_same::value>::type* = nullptr) { - if (toupper(transA[0]) == 'N' && toupper(transB[0]) == 'N' && - B.extent(1) == size_t(1)) { + if (toupper(transA[0]) == 'N' && toupper(transB[0]) == 'N' && B.extent(1) == size_t(1)) { // since B/C both have a single column and are not LayoutStride, // can create a raw contiguous rank-1 vector from them rather than using // subview. - Kokkos::View> Bvec(B.data(), B.extent(0)); - Kokkos::View> Cvec(C.data(), C.extent(0)); KokkosBlas::gemv(space, "N", alpha, A, Bvec, beta, Cvec); @@ -76,15 +66,11 @@ bool gemv_based_gemm( // tests. template bool gemv_based_gemm( - const typename CViewType::execution_space& /*space*/, - const char /*transA*/[], const char /*transB*/[], - typename AViewType::const_value_type& /*alpha*/, const AViewType& /*A*/, - const BViewType& /*B*/, typename CViewType::const_value_type& /*beta*/, - const CViewType& /*C*/, - typename std::enable_if::value || - std::is_same::value>::type* = + const typename CViewType::execution_space& /*space*/, const char /*transA*/[], const char /*transB*/[], + typename AViewType::const_value_type& /*alpha*/, const AViewType& /*A*/, const BViewType& /*B*/, + typename CViewType::const_value_type& /*beta*/, const CViewType& /*C*/, + typename std::enable_if::value || + std::is_same::value>::type* = nullptr) { return false; } @@ -108,52 +94,35 @@ bool gemv_based_gemm( /// \param B [in] Input matrix, as a 2-D Kokkos::View /// \param beta [in] Input coefficient of C /// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View -template -void gemm(const execution_space& space, const char transA[], - const char transB[], typename AViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B, +template +void gemm(const execution_space& space, const char transA[], const char transB[], + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_execution_space_v, "KokkosBlas::gemm: execution_space must be a valid Kokkos " "execution space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemm: AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemm: BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gemm: CViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "KokkosBlas::gemm: AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "KokkosBlas::gemm: BViewType must have rank 2."); - static_assert(static_cast(CViewType::rank) == 2, - "KokkosBlas::gemm: CViewType must have rank 2."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: AViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: BViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: CViewType must be accessible from execution_space"); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemm: AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemm: BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosBlas::gemm: CViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "KokkosBlas::gemm: AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "KokkosBlas::gemm: BViewType must have rank 2."); + static_assert(static_cast(CViewType::rank) == 2, "KokkosBlas::gemm: CViewType must have rank 2."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: AViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: BViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: CViewType must be accessible from execution_space"); // Check validity of transpose argument - bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || - (transA[0] == 'T') || (transA[0] == 't') || + bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || (transA[0] == 'T') || (transA[0] == 't') || (transA[0] == 'C') || (transA[0] == 'c'); - bool valid_transB = (transB[0] == 'N') || (transB[0] == 'n') || - (transB[0] == 'T') || (transB[0] == 't') || + bool valid_transB = (transB[0] == 'N') || (transB[0] == 'n') || (transB[0] == 'T') || (transB[0] == 't') || (transB[0] == 'C') || (transB[0] == 'c'); if (!(valid_transA && valid_transB)) { std::ostringstream os; - os << "KokkosBlas::gemm: transA[0] = '" << transA[0] << " transB[0] = '" - << transB[0] << "'. " + os << "KokkosBlas::gemm: transA[0] = '" << transA[0] << " transB[0] = '" << transB[0] << "'. " << "Valid values include 'N' or 'n' (No transpose), 'T' or 't' " "(Transpose), " "and 'C' or 'c' (Conjugate transpose)."; @@ -172,13 +141,11 @@ void gemm(const execution_space& space, const char transA[], int64_t C0 = C.extent(0); int64_t C1 = C.extent(1); - if (((A_t ? A1 : A0) != C0) || ((B_t ? B_0 : B1) != C1) || - ((A_t ? A0 : A1) != (B_t ? B1 : B_0))) { + if (((A_t ? A1 : A0) != C0) || ((B_t ? B_0 : B1) != C1) || ((A_t ? A0 : A1) != (B_t ? B1 : B_0))) { std::ostringstream os; os << "KokkosBlas::gemm: Dimensions of A, B, and C do not match: " - << "transA: " << transA[0] << " transB: " << transB[0] - << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) - << " x " << B.extent(1) << " C: " << C.extent(0) << " x " << C.extent(1); + << "transA: " << transA[0] << " transB: " << transB[0] << " A: " << A.extent(0) << " x " << A.extent(1) + << " B: " << B.extent(0) << " x " << B.extent(1) << " C: " << C.extent(0) << " x " << C.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } #endif // KOKKOSKERNELS_DEBUG_LEVEL > 0 @@ -195,24 +162,19 @@ void gemm(const execution_space& space, const char transA[], } // Check if gemv code path is allowed and profitable, and if so run it. - if (Impl::gemv_based_gemm(space, transA, transB, alpha, A, B, beta, C)) - return; + if (Impl::gemv_based_gemm(space, transA, transB, alpha, A, B, beta, C)) return; // Minimize the number of Impl::GEMM instantiations, by // standardizing on particular View specializations for its template // parameters. - typedef Kokkos::View< - typename AViewType::const_value_type**, typename AViewType::array_layout, - typename AViewType::device_type, Kokkos::MemoryTraits> + typedef Kokkos::View> AVT; - typedef Kokkos::View< - typename BViewType::const_value_type**, typename BViewType::array_layout, - typename BViewType::device_type, Kokkos::MemoryTraits> + typedef Kokkos::View> BVT; - typedef Kokkos::View> + typedef Kokkos::View> CVT; typedef Impl::GEMM impl_type; impl_type::gemm(space, transA, transB, alpha, A, B, beta, C); @@ -236,12 +198,9 @@ void gemm(const execution_space& space, const char transA[], /// \param beta [in] Input coefficient of C /// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View template -void gemm(const char transA[], const char transB[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B, typename CViewType::const_value_type& beta, - const CViewType& C) { - gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, - C); +void gemm(const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { + gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, C); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trmm.hpp b/blas/src/KokkosBlas3_trmm.hpp index bdc86d4d9e..9da47b7160 100644 --- a/blas/src/KokkosBlas3_trmm.hpp +++ b/blas/src/KokkosBlas3_trmm.hpp @@ -64,29 +64,19 @@ namespace KokkosBlas { /// On entry, M-by-N matrix /// On exit, overwritten with the solution template -void trmm(const execution_space& space, const char side[], const char uplo[], - const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); +void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); // Check validity of indicator argument - bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || - (side[0] == 'r'); - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || - (trans[0] == 'T') || (trans[0] == 't') || + bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || (side[0] == 'r'); + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l'); + bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || (trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'C') || (trans[0] == 'c'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || (diag[0] == 'n'); if (!valid_side) { std::ostringstream os; os << "KokkosBlas::trmm: side = '" << side[0] << "'. " @@ -133,27 +123,20 @@ void trmm(const execution_space& space, const char side[], const char uplo[], if (A_m != A_n || (is_A_lower_triangle ? B_m : B_n) != A_n) { std::ostringstream os; os << "KokkosBlas::trmm: Dimensions of A and B do not match: " - << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) - << " B: " << B.extent(0) << " x " << B.extent(1); + << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) << " x " + << B.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Create A matrix view type alias - using AViewInternalType = - Kokkos::View >; + using AViewInternalType = Kokkos::View >; // Crease B matrix view type alias - using BViewInternalType = - Kokkos::View >; + using BViewInternalType = Kokkos::View >; - KokkosBlas::Impl::TRMM::trmm(space, side, uplo, trans, - diag, alpha, A, B); + KokkosBlas::Impl::TRMM::trmm(space, side, uplo, trans, diag, + alpha, A, B); } /// \brief Solve triangular linear system with multiple RHSs: @@ -186,11 +169,9 @@ void trmm(const execution_space& space, const char side[], const char uplo[], /// On entry, M-by-N matrix /// On exit, overwritten with the solution template -void trmm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { - trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, - B); +void trmm(const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 890b2ff6aa..fd0123174e 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -60,29 +60,19 @@ namespace KokkosBlas { /// On entry, M-by-N matrix of multile RHS /// On exit, overwritten with the solution X template -void trsm(const execution_space& space, const char side[], const char uplo[], - const char trans[], const char diag[], - typename BViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); +void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "BViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + static_assert(static_cast(BViewType::rank) == 2, "BViewType must have rank 2."); // Check validity of indicator argument - bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || - (side[0] == 'r'); - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || - (trans[0] == 'T') || (trans[0] == 't') || + bool valid_side = (side[0] == 'L') || (side[0] == 'l') || (side[0] == 'R') || (side[0] == 'r'); + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l'); + bool valid_trans = (trans[0] == 'N') || (trans[0] == 'n') || (trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'C') || (trans[0] == 'c'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || (diag[0] == 'n'); if (!valid_side) { std::ostringstream os; os << "KokkosBlas::trsm: side = '" << side[0] << "'. " @@ -125,30 +115,23 @@ void trsm(const execution_space& space, const char side[], const char uplo[], if ((A0 != A1) || ((A_s ? B0 : B1) != A1)) { std::ostringstream os; os << "KokkosBlas::trsm: Dimensions of A and B do not match: " - << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) - << " B: " << B.extent(0) << " x " << B.extent(1); + << "side: " << side[0] << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) << " x " + << B.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Return if degenerated matrices are provided - if ((A.extent(0) == 0) || (A.extent(1) == 0) || (B.extent(0) == 0) || - (B.extent(1) == 0)) - return; + if ((A.extent(0) == 0) || (A.extent(1) == 0) || (B.extent(0) == 0) || (B.extent(1) == 0)) return; // Minimize the number of Impl::TRSM instantiations, by // standardizing on particular View specializations for its template // parameters. - using AVT = Kokkos::View >; - using BVT = Kokkos::View >; + using AVT = Kokkos::View >; + using BVT = Kokkos::View >; - KokkosBlas::Impl::TRSM::trsm( - space, side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRSM::trsm(space, side, uplo, trans, diag, alpha, A, B); } /// \brief Solve triangular linear system with multiple RHSs: @@ -179,11 +162,9 @@ void trsm(const execution_space& space, const char side[], const char uplo[], /// On entry, M-by-N matrix of multile RHS /// On exit, overwritten with the solution X template -void trsm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { - trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, - B); +void trsm(const char side[], const char uplo[], const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { + trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp index d9771e3a16..34ca96b2d4 100644 --- a/blas/src/KokkosBlas_trtri.hpp +++ b/blas/src/KokkosBlas_trtri.hpp @@ -43,8 +43,7 @@ namespace KokkosBlas { // and the inversion could not be completed. // source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri template -[[deprecated]] int trtri(const char uplo[], const char diag[], - const AViewType& A) { +[[deprecated]] int trtri(const char uplo[], const char diag[], const AViewType& A) { return KokkosLapack::trtri(uplo, diag, A); } diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index e2b04e300d..1ed52d35b8 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_tpl_spec_avail { enum : bool { value = false }; }; @@ -34,54 +33,44 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct axpby_tpl_spec_avail< \ - ExecSpace, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct axpby_tpl_spec_avail< \ + ExecSpace, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct axpby_tpl_spec_avail< \ - ExecSpace, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct axpby_tpl_spec_avail< \ + ExecSpace, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 65154b9985..5ab29e632f 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -27,8 +27,7 @@ inline void axpby_print_specialization() { printf( "KokkosBlas1::axpby<> TPL Blas specialization for < %s , %s , %s , %s " ">\n", - typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), - typeid(YMV).name()); + typeid(AV).name(), typeid(XMV).name(), typeid(BV).name(), typeid(YMV).name()); #endif } } // namespace @@ -40,158 +39,132 @@ inline void axpby_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef double AV; \ - typedef double BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, double, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + double, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef double AV; \ + typedef double BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef float AV; \ - typedef float BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, float, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + float, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef float AV; \ + typedef float BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy(N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy(N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) @@ -222,186 +195,152 @@ KOKKOSBLAS1_CAXPBY_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - double, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef double AV; \ - typedef double BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, double, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + double, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef double AV; \ + typedef double BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - float, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef float AV; \ - typedef float BV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby< \ + ExecSpace, float, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + float, \ + Kokkos::View, Kokkos::MemoryTraits >, 1, \ + true, ETI_SPEC_AVAIL> { \ + typedef float AV; \ + typedef float BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ - const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 13cc2a6f92..8d5f1b939b 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct dot_tpl_spec_avail { enum : bool { value = false }; }; @@ -34,24 +33,20 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct dot_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct dot_tpl_spec_avail >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) // TODO: we met difficuties in FindTPLMKL.cmake to set the BLAS library properly // such that the test in CheckHostBlasReturnComplex.cmake could not be @@ -59,33 +54,28 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, // This resulted in segfault in dot() with MKL and complex. // So we just temporarily disable it until FindTPLMKL.cmake is fixed. #if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif #endif -#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct dot_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct dot_tpl_spec_avail >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ - MEMSPACE) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) \ KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS @@ -100,13 +90,11 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #endif } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 247957b2c8..fa9d5fafce 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -24,8 +24,8 @@ namespace { template inline void dot_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::dot<> TPL Blas specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(XV).name(), typeid(YV).name()); + printf("KokkosBlas1::dot<> TPL Blas specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(XV).name(), + typeid(YV).name()); #endif } } // namespace @@ -39,59 +39,44 @@ inline void dot_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::dot( \ - N, reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Dot >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + dot_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + R() = HostBlas::dot(N, reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one); \ + } else { \ + Dot::dot(space, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, float, float, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, double, double, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) @@ -108,69 +93,51 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_DOT, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ - 1, reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + dot_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Dot::dot(space, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasSdot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasDdot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ - Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ - Kokkos::Cuda, Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, Kokkos::Cuda, Kokkos::CudaSpace, cublasSdot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) @@ -185,68 +152,50 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_DOT, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - dot_print_specialization(); \ - const rocblas_int N = static_cast(numElems); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ - 1, reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + dot_print_specialization(); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Dot::dot(space, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_sdot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_ddot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_cdotc, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_zdotc, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, Kokkos::HIP, Kokkos::HIPSpace, rocblas_sdot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_ddot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_cdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_zdotc, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) @@ -262,67 +211,50 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_DOT, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - dot_print_specialization(); \ - const std::int64_t N = static_cast(numElems); \ - TPL_DOT(exec.sycl_queue(), N, \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R())); \ - } else { \ - Dot::dot(exec, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + dot_print_specialization(); \ + const std::int64_t N = static_cast(numElems); \ + TPL_DOT(exec.sycl_queue(), N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, reinterpret_cast(&R())); \ + } else { \ + Dot::dot(exec, R, X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::dot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::dot, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(true) KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(false) diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 616c26c87a..36a5e5171f 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -33,145 +33,96 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) // double -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template \ - struct iamax_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(INDEX_TYPE, SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct iamax_tpl_spec_avail< \ + ExecSpace, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // double -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(INDEX_TYPE, SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::Cuda, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template <> \ - struct iamax_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::HIP, Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, double, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, double, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, float, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, float, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp index 913ec5a151..c85de4d186 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp @@ -23,15 +23,12 @@ template inline void iamax_print_specialization() { #if defined(KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION) #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) - printf("KokkosBlas1::iamax<> TPL cuBLAS specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::iamax<> TPL cuBLAS specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #elif defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) - printf("KokkosBlas1::iamax<> TPL rocBLAS specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::iamax<> TPL rocBLAS specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #else #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - printf("KokkosBlas1::iamax<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::iamax<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif #endif #endif @@ -46,90 +43,63 @@ inline void iamax_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS( \ - SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - iamax_print_specialization(); \ - int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 1 : XST; \ - int idx = HostBlas::iamax( \ - N, reinterpret_cast(X.data()), LDX); \ - R() = static_cast(idx); \ - } else { \ - Iamax::iamax(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_BLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + iamax_print_specialization(); \ + int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 1 : XST; \ + int idx = HostBlas::iamax(N, reinterpret_cast(X.data()), LDX); \ + R() = static_cast(idx); \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -145,227 +115,155 @@ namespace Impl { using CUBLAS_DEVICE_TYPE = Kokkos::Device; #if defined(KOKKOS_ENABLE_CUDA_UVM) -using CUBLASUVM_DEVICE_TYPE = - Kokkos::Device; +using CUBLASUVM_DEVICE_TYPE = Kokkos::Device; #endif -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, EXEC_SPACE, \ - MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, CUBLAS_PTR_MODE_1, \ - CUBLAS_PTR_MODE_2) \ - template <> \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const EXEC_SPACE& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::iamax[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - Kokkos::deep_copy(R, 0); \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - iamax_print_specialization(); \ - const int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 1 : XST; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t prevPtrMode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &prevPtrMode)); \ - if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, N, reinterpret_cast(X.data()), \ - LDX, reinterpret_cast(R.data()))); \ - if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } \ - } else { \ - Iamax::iamax(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + EXEC_SPACE, MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ + CUBLAS_PTR_MODE_1, CUBLAS_PTR_MODE_2) \ + template <> \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const EXEC_SPACE& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + Kokkos::deep_copy(R, 0); \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + iamax_print_specialization(); \ + const int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 1 : XST; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t prevPtrMode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &prevPtrMode)); \ + if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(X.data()), LDX, \ + reinterpret_cast(R.data()))); \ + if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ - CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ - CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, \ - CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #if defined(KOKKOS_ENABLE_CUDA_UVM) -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ - CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ - Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, \ - CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #endif -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(double, double, cublasIdamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(float, float, cublasIsamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, \ - cublasIcamax, INDEX_TYPE, LAYOUT, \ +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(double, double, cublasIdamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(float, float, cublasIsamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, cublasIcamax, INDEX_TYPE, LAYOUT, \ MEMSPACE, ETI_SPEC_AVAIL) #if defined(KOKKOS_ENABLE_CUDA_UVM) -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(double, double, cublasIdamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(double, double, cublasIdamax, INDEX_TYPE, LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(float, float, cublasIsamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(float, float, cublasIsamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, \ + LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - Kokkos::complex, cuDoubleComplex, cublasIzamax, INDEX_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - Kokkos::complex, cuComplex, cublasIcamax, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(Kokkos::complex, cuComplex, cublasIcamax, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) #endif -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) #if defined(KOKKOS_ENABLE_CUDA_UVM) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned long, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) #endif } // namespace Impl @@ -382,144 +280,100 @@ namespace Impl { using ROCBLAS_DEVICE_TYPE = Kokkos::Device; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, ROCBLAS_PTR_MODE_1, \ - ROCBLAS_PTR_MODE_2) \ - template <> \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = Kokkos::HIP; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - Kokkos::deep_copy(R, 0); \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - iamax_print_specialization(); \ - const int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 1 : XST; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode prevPtrMode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - ROCBLAS_FN(s.handle, N, \ - reinterpret_cast(X.data()), \ - LDX, reinterpret_cast(R.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ - } \ - } else { \ - Iamax::iamax(space, \ - R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, \ + LAYOUT, MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ + ROCBLAS_PTR_MODE_1, ROCBLAS_PTR_MODE_2) \ + template <> \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = Kokkos::HIP; \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + Kokkos::deep_copy(R, 0); \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + iamax_print_specialization(); \ + const int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 1 : XST; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode prevPtrMode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(X.data()), \ + LDX, reinterpret_cast(R.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ + } \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, rocblas_pointer_mode_host, \ - rocblas_pointer_mode_device) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL, ROCBLAS_DEVICE_TYPE, \ - rocblas_pointer_mode_device, rocblas_pointer_mode_host) - -#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_idamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_isamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_double_complex, rocblas_izamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_float_complex, rocblas_icamax, \ - INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::HIPSpace, false) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + rocblas_pointer_mode_host, rocblas_pointer_mode_device) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ + MEMSPACE, ETI_SPEC_AVAIL, ROCBLAS_DEVICE_TYPE, \ + rocblas_pointer_mode_device, rocblas_pointer_mode_host) + +#define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_idamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_isamax, INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_double_complex, rocblas_izamax, \ + INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_float_complex, rocblas_icamax, INDEX_TYPE, \ + LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp index 8d3fc0f4d2..3924e0da21 100644 --- a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 8f79c8a58d..6de384380e 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -33,83 +33,62 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS // double -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS @@ -118,30 +97,23 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #if defined(KOKKOS_ENABLE_SYCL) -#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm1_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( - Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) #endif // KOKKOS_ENABLE_SYCL #endif // KOKKOSKERNELS_ENABLE_TPL_MKL diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 12a240db6b..378fbc936f 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void nrm1_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::nrm1<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::nrm1<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif } } // namespace @@ -39,87 +38,64 @@ inline void nrm1_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct Nrm1< \ - EXECSPACE, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using mag_type = typename Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View>; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const EXECSPACE& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS," #SCALAR "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - if constexpr (Kokkos::ArithTraits::is_complex) { \ - R() = HostBlas>::asum( \ - N, reinterpret_cast*>(X.data()), \ - one); \ - } else { \ - R() = HostBlas::asum(N, X.data(), one); \ - } \ - } else { \ - Nrm1::value>::nrm1(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct Nrm1::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using mag_type = typename Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const EXECSPACE& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS," #SCALAR "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + if constexpr (Kokkos::ArithTraits::is_complex) { \ + R() = HostBlas>::asum(N, reinterpret_cast*>(X.data()), \ + one); \ + } else { \ + R() = HostBlas::asum(N, X.data(), one); \ + } \ + } else { \ + Nrm1::value>::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_OPENMP) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_THREADS) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) #endif } // namespace Impl @@ -135,99 +111,74 @@ namespace KokkosBlas { namespace Impl { template -void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, - const XViewType& X) { +void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewType& X) { using XScalar = typename XViewType::non_const_value_type; nrm1_print_specialization(); - const int N = static_cast(X.extent(0)); - constexpr int one = 1; - KokkosBlas::Impl::CudaBlasSingleton& s = - KokkosBlas::Impl::CudaBlasSingleton::singleton(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( - cublasSasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( - cublasDasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { KOKKOS_CUBLAS_SAFE_CALL_IMPL( - cublasScasum(s.handle, N, reinterpret_cast(X.data()), - one, R.data())); + cublasScasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( - s.handle, N, reinterpret_cast(X.data()), one, - R.data())); + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasDzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); } -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct Nrm1< \ - Kokkos::Cuda, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using execution_space = Kokkos::Cuda; \ - using RV = Kokkos::View::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits>; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS," #SCALAR \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - cublasAsumWrapper(space, R, X); \ - } else { \ - Nrm1::value>::nrm1(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct Nrm1::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using execution_space = Kokkos::Cuda; \ + using RV = Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS," #SCALAR "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + cublasAsumWrapper(space, R, X); \ + } else { \ + Nrm1::value>::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -242,89 +193,67 @@ namespace KokkosBlas { namespace Impl { template -void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, - const XViewType& X) { +void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewType& X) { using XScalar = typename XViewType::non_const_value_type; nrm1_print_specialization(); - const int N = static_cast(X.extent(0)); - constexpr int one = 1; - KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_sasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_dasum(s.handle, N, X.data(), one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( - s.handle, N, reinterpret_cast(X.data()), - one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_scasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( - s.handle, N, reinterpret_cast(X.data()), - one, R.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_dzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct Nrm1< \ - Kokkos::HIP, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using RV = Kokkos::View::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits>; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const Kokkos::HIP& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS," #SCALAR \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - rocblasAsumWrapper(space, R, X); \ - } else { \ - Nrm1::value>::nrm1(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct Nrm1::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using RV = Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const Kokkos::HIP& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS," #SCALAR "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + rocblasAsumWrapper(space, R, X); \ + } else { \ + Nrm1::value>::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosBlas @@ -343,8 +272,7 @@ namespace KokkosBlas { namespace Impl { template -void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, - const XViewType& X) { +void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewType& X) { using XScalar = typename XViewType::non_const_value_type; using KAT_X = Kokkos::ArithTraits; using layout_t = typename XViewType::array_layout; @@ -352,100 +280,75 @@ void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, const std::int64_t N = static_cast(X.extent(0)); // Create temp view on device to store the result - Kokkos::View::mag_type, - typename XViewType::memory_space> - res("sycl asum result"); + Kokkos::View::mag_type, typename XViewType::memory_space> res( + "sycl asum result"); // Decide to call row_major or column_major function if constexpr (std::is_same_v) { if constexpr (KAT_X::is_complex) { - oneapi::mkl::blas::row_major::asum( - space.sycl_queue(), N, - reinterpret_cast*>( - X.data()), - 1, res.data()); - } else { - oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, X.data(), 1, + oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, + reinterpret_cast*>(X.data()), 1, res.data()); + } else { + oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, X.data(), 1, res.data()); } } else { if constexpr (KAT_X::is_complex) { - oneapi::mkl::blas::column_major::asum( - space.sycl_queue(), N, - reinterpret_cast*>( - X.data()), - 1, res.data()); + oneapi::mkl::blas::column_major::asum(space.sycl_queue(), N, + reinterpret_cast*>(X.data()), + 1, res.data()); } else { - oneapi::mkl::blas::column_major::asum(space.sycl_queue(), X.extent_int(0), - X.data(), 1, res.data()); + oneapi::mkl::blas::column_major::asum(space.sycl_queue(), X.extent_int(0), X.data(), 1, res.data()); } } // Bring result back to host Kokkos::deep_copy(space, R, res); } -#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct Nrm1< \ - Kokkos::Experimental::SYCL, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, \ - nrm1_eti_spec_avail< \ - Kokkos::Experimental::SYCL, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using execution_space = Kokkos::Experimental::SYCL; \ - using RV = Kokkos::View::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits>; \ - using XV = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using size_type = typename XV::size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ONEMKL," #SCALAR \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - onemklAsumWrapper(space, R, X); \ - } else { \ - Nrm1::value>::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct Nrm1< \ + Kokkos::Experimental::SYCL, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + using RV = Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ONEMKL," #SCALAR "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + onemklAsumWrapper(space, R, X); \ + } else { \ + Nrm1::value>::nrm1( \ + space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) -KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) -KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLSharedUSMSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 7bc55becc0..b7b70b5edb 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -32,60 +32,47 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrm2_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm2_tpl_spec_avail::mag_type, LAYOUT, \ + Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif -#define KOKKOSBLAS1_NRM2_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ - template <> \ - struct nrm2_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct nrm2_tpl_spec_avail::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ - MEMSPACE) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index ef45238405..b1e4cd58b9 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void nrm2_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::nrm2<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::nrm2<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif } } // namespace @@ -39,175 +38,131 @@ inline void nrm2_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas::nrm2(N, X.data(), int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = HostBlas::nrm2(N, X.data(), int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas::nrm2(N, X.data(), int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = HostBlas::nrm2(N, X.data(), int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas >::nrm2( \ - N, reinterpret_cast*>(X.data()), \ - int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = HostBlas >::nrm2(N, reinterpret_cast*>(X.data()), \ + int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - int N = numElems; \ - int int_one = 1; \ - R() = HostBlas >::nrm2( \ - N, reinterpret_cast*>(X.data()), \ - int_one); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Nrm2 >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + int N = numElems; \ + int int_one = 1; \ + R() = \ + HostBlas >::nrm2(N, reinterpret_cast*>(X.data()), int_one); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -220,66 +175,48 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), \ - 1, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasSnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::Cuda, Kokkos::CudaSpace, \ - cublasDnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ - Kokkos::CudaSpace, cublasScnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ - Kokkos::Cuda, Kokkos::CudaSpace, cublasDznrm2, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasScnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasDznrm2, ETI_SPEC_AVAIL) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(true) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(false) @@ -295,66 +232,48 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ROCBLAS," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const rocblas_int N = static_cast(numElems); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), \ - 1, &R())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ROCBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_snrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ - Kokkos::HIP, Kokkos::HIPSpace, \ - rocblas_dnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_scnrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ - Kokkos::HIP, Kokkos::HIPSpace, rocblas_dznrm2, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_snrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_dnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_scnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_dznrm2, ETI_SPEC_AVAIL) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(true) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) @@ -372,64 +291,49 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ - EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2::mag_type, LAYOUT, \ - Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + \ - "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= \ - static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const std::int64_t N = static_cast(numElems); \ - TPL_NRM2(space.sycl_queue(), N, \ - reinterpret_cast(X.data()), 1, &R()); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, \ - take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ONEMKL," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const std::int64_t N = static_cast(numElems); \ + TPL_NRM2(space.sycl_queue(), N, reinterpret_cast(X.data()), 1, &R()); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ - Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ - Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ - Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ - oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::nrm2, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, oneapi::mkl::blas::row_major::nrm2, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(true) KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(false) diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 88591fbf0c..27647eed11 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -33,28 +33,21 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double -#define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct nrminf_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrminf_tpl_spec_avail::mag_type, \ + LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index 17ec54e057..0b2081fc27 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void nrminf_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::nrminf<> TPL Blas specialization for < %s , %s >\n", - typeid(RV).name(), typeid(XV).name()); + printf("KokkosBlas1::nrminf<> TPL Blas specialization for < %s , %s >\n", typeid(RV).name(), typeid(XV).name()); #endif } } // namespace @@ -39,201 +38,152 @@ inline void nrminf_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf< \ - ExecSpace, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = HostBlas::iamax(N, X.data(), one) - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = HostBlas::iamax(N, X.data(), one) - 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf< \ - ExecSpace, \ - Kokkos::View>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0f; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = HostBlas::iamax(N, X.data(), one) - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0f; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = HostBlas::iamax(N, X.data(), one) - 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> \ - IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = \ - HostBlas>::iamax( \ - N, reinterpret_cast*>(X.data()), \ - one) - \ - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = \ + HostBlas>::iamax(N, reinterpret_cast*>(X.data()), one) - \ + 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> \ - IPT; \ - \ - static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0f; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = \ - HostBlas>::iamax( \ - N, reinterpret_cast*>(X.data()), \ - one) - \ - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(space, R, \ - X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0f; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = \ + HostBlas>::iamax(N, reinterpret_cast*>(X.data()), one) - 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp index 59f1715e54..fee65fce14 100644 --- a/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rot_tpl_spec_avail.hpp @@ -32,62 +32,46 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ - template <> \ - struct rot_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct rot_tpl_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #ifdef KOKKOS_ENABLE_OPENMP KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rot_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rot_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) #endif // rocBLAS diff --git a/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp index 8c83f9a096..404c5c0e3b 100644 --- a/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp @@ -24,9 +24,8 @@ namespace { template inline void rot_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas::rot<> TPL Blas specialization for < %s, %s, %s >\n", - typeid(VectorView).name(), typeid(ScalarView).name(), - typeid(ExecutionSpace).name); + printf("KokkosBlas::rot<> TPL Blas specialization for < %s, %s, %s >\n", typeid(VectorView).name(), + typeid(ScalarView).name(), typeid(ExecutionSpace).name); #endif } } // namespace @@ -40,110 +39,76 @@ inline void rot_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,double]"); \ - HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,double]"); \ + HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,float]"); \ - HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,float]"); \ + HostBlas::rot(X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Rot, EXECSPACE, MEMSPACE, true, \ - ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_BLAS,complex]"); \ - HostBlas>::rot( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1, c.data(), \ - s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Rot, EXECSPACE, MEMSPACE, true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,complex]"); \ + HostBlas>::rot(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Rot, EXECSPACE, MEMSPACE, true, \ - ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& /*space*/, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_BLAS,complex]"); \ - HostBlas>::rot( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1, c.data(), \ - s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Rot, EXECSPACE, MEMSPACE, true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& /*space*/, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_BLAS,complex]"); \ + HostBlas>::rot(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1, c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -186,230 +151,149 @@ KOKKOSBLAS1_CROT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,double]"); \ - rot_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasDrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,double]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasDrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,float]"); \ - rot_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasSrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, \ - c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,float]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasSrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_CUBLAS,complex]"); \ - rot_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasZdrot(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, c.data(), \ - s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasZdrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rot*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using ScalarView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rot(EXECSPACE const& space, VectorView const& X, \ - VectorView const& Y, ScalarView const& c, \ - ScalarView const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rot[TPL_CUBLAS,complex]"); \ - rot_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - cublasCsrot(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, c.data(), \ - s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rot< \ + EXECSPACE, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using VectorView = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using ScalarView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rot(EXECSPACE const& space, VectorView const& X, VectorView const& Y, ScalarView const& c, \ + ScalarView const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ + rot_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + cublasCsrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS diff --git a/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp index ea94ff04dc..f8b8184b80 100644 --- a/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rotg_tpl_spec_avail.hpp @@ -32,157 +32,90 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rotg_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rotg_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct rotg_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct rotg_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTG_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp index ee6a6c8c04..e6583d5ae3 100644 --- a/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp @@ -24,8 +24,8 @@ namespace { template inline void rotg_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::rotg<> TPL Blas specialization for < %s, %s >\n", - typeid(Scalar).name(), typeid(ExecutionSpace).name); + printf("KokkosBlas1::rotg<> TPL Blas specialization for < %s, %s >\n", typeid(Scalar).name(), + typeid(ExecutionSpace).name); #endif } } // namespace @@ -39,184 +39,130 @@ inline void rotg_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,double]"); \ - HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,double]"); \ + HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,float]"); \ - HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,float]"); \ + HostBlas::rotg(a.data(), b.data(), c.data(), s.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_BLAS,complex]"); \ - HostBlas>::rotg( \ - reinterpret_cast*>(a.data()), \ - reinterpret_cast*>(b.data()), c.data(), \ - reinterpret_cast*>(s.data())); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,complex]"); \ + HostBlas>::rotg(reinterpret_cast*>(a.data()), \ + reinterpret_cast*>(b.data()), c.data(), \ + reinterpret_cast*>(s.data())); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, \ - MViewType const& c, SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_BLAS,complex]"); \ - HostBlas>::rotg( \ - reinterpret_cast*>(a.data()), \ - reinterpret_cast*>(b.data()), c.data(), \ - reinterpret_cast*>(s.data())); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_BLAS,complex]"); \ + HostBlas>::rotg(reinterpret_cast*>(a.data()), \ + reinterpret_cast*>(b.data()), c.data(), \ + reinterpret_cast*>(s.data())); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl @@ -231,231 +177,151 @@ KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,double]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotg(singleton.handle, a.data(), \ - b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,double]"); \ + rotg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,float]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotg(singleton.handle, a.data(), \ - b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,float]"); \ + rotg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZrotg( \ - singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCrotg( \ - singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -469,201 +335,137 @@ KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,double]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotg( \ - singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,double]"); \ + rotg_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,float]"); \ - rotg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotg( \ - singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,float]"); \ + rotg_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zrotg( \ - singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zrotg(singleton.handle, \ + reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, \ - SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode( \ - singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_crotg( \ - singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, true, \ + ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_crotg(singleton.handle, \ + reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp index 2a1ee21cc6..84e7452e65 100644 --- a/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rotm_tpl_spec_avail.hpp @@ -34,90 +34,65 @@ namespace Impl { // ARMPL is disabled as it does not detect some corner // cases correctly which leads to failing unit-tests #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) -#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotm_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotm_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotm_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotm_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_ROTM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp index ce8826e1ee..7bde6d0835 100644 --- a/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void rotm_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::rotm<> TPL Blas specialization for < %s >\n", - typeid(Scalar).name()); + printf("KokkosBlas1::rotm<> TPL Blas specialization for < %s >\n", typeid(Scalar).name()); #endif } } // namespace @@ -39,68 +38,45 @@ inline void rotm_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ParamView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotm(EXEC_SPACE const& /* space */, VectorView& X, \ - VectorView& Y, ParamView& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_BLAS,SCALAR]"); \ - HostBlas::rotm(X.extent(0), X.data(), 1, Y.data(), 1, \ - param.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ParamView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rotm(EXEC_SPACE const& /* space */, VectorView& X, VectorView& Y, ParamView& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_BLAS,SCALAR]"); \ + HostBlas::rotm(X.extent(0), X.data(), 1, Y.data(), 1, param.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl @@ -115,101 +91,69 @@ KOKKOSBLAS1_ROTM_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ParamView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, ParamView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,double]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotm( \ - s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ParamView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,double]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using ParamView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, ParamView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,float]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotm( \ - s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using ParamView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, ParamView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,float]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -223,103 +167,71 @@ KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,double]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_drotm(s.handle, static_cast(X.extent(0)), X.data(), 1, \ - Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,double]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_drotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotm< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using VectorView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotm(EXEC_SPACE const& space, VectorView const& X, \ - VectorView const& Y, PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,float]"); \ - rotm_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_srotm(s.handle, static_cast(X.extent(0)), X.data(), 1, \ - Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +#define KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotm< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using VectorView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotm(EXEC_SPACE const& space, VectorView const& X, VectorView const& Y, PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,float]"); \ + rotm_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_srotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp index d4db1143f9..3a2925fd49 100644 --- a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_avail.hpp @@ -33,88 +33,66 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) // ARMPL is disabled as it does not detect some corner // cases correctly which leads to failing unit-tests -#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) -#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct rotmg_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct rotmg_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Turning off use of rocBLAS as it returns false results in some of the diff --git a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp index e911294df4..0271cfd981 100644 --- a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp @@ -24,8 +24,7 @@ namespace { template inline void rotmg_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas1::rotmg<> TPL Blas specialization for < %s >\n", - typeid(Scalar).name()); + printf("KokkosBlas1::rotmg<> TPL Blas specialization for < %s >\n", typeid(Scalar).name()); #endif } } // namespace @@ -33,80 +32,54 @@ inline void rotmg_print_specialization() { } // namespace KokkosBlas // Generic Host side BLAS (could be MKL or whatever) -#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && !defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void rotmg(EXEC_SPACE const& /* space */, DXView& d1, DXView& d2, \ - DXView& x1, YView& y1, PView& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_BLAS,double]"); \ - HostBlas::rotmg(d1.data(), d2.data(), x1.data(), y1.data(), \ - param.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void rotmg(EXEC_SPACE const& /* space */, DXView& d1, DXView& d2, DXView& x1, YView& y1, PView& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_BLAS,double]"); \ + HostBlas::rotmg(d1.data(), d2.data(), x1.data(), y1.data(), param.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace, false) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, true) -KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl @@ -121,114 +94,77 @@ KOKKOSBLAS1_ROTMG_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,double]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,double]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -#define KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,float]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetPointerMode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,float]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + cublasPointerMode_t pointer_mode; \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -242,114 +178,79 @@ KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,double]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,double]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_drotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -#define KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotmg< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using DXView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YView = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PView = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void rotmg(EXEC_SPACE const& space, DXView const& d1, \ - DXView const& d2, DXView const& x1, YView const& y1, \ - PView const& param) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,float]"); \ - rotmg_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotmg(s.handle, d1.data(), \ - d2.data(), x1.data(), \ - y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotmg< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using DXView = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PView = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void rotmg(EXEC_SPACE const& space, DXView const& d1, DXView const& d2, DXView const& x1, YView const& y1, \ + PView const& param) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,float]"); \ + rotmg_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_srotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index 5c5a6008ec..b5efa5c3a4 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_tpl_spec_avail { enum : bool { value = false }; }; @@ -34,98 +33,71 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) // double -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct scal_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct scal_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // double -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct scal_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct scal_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct scal_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct scal_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index da11555f7b..7083e28730 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -24,8 +24,8 @@ namespace { template inline void scal_print_specialization() { #if defined(KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION) - printf("KokkosBlas1::scal<> TPL Blas specialization for < %s , %s , %s >\n", - typeid(RV).name(), typeid(AS).name(), typeid(XV).name()); + printf("KokkosBlas1::scal<> TPL Blas specialization for < %s , %s , %s >\n", typeid(RV).name(), typeid(AS).name(), + typeid(XV).name()); #endif } } // namespace @@ -38,87 +38,63 @@ inline void scal_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Scal< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ - const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - const BASE_SCALAR_TYPE alpha_b = static_cast(alpha); \ - HostBlas::scal( \ - N, alpha_b, reinterpret_cast(R.data()), one); \ - } else { \ - Scal::scal(space, R, \ - alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + const BASE_SCALAR_TYPE alpha_b = static_cast(alpha); \ + HostBlas::scal(N, alpha_b, reinterpret_cast(R.data()), one); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(double, double, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(float, float, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, \ - std::complex, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_BLAS(Kokkos::complex, std::complex, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -132,117 +108,81 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ - CUBLAS_FN, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Scal< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ - const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Scal::scal(space, R, \ - alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(double, double, cublasDscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(float, float, cublasSscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, \ - cuDoubleComplex, cublasZscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, \ - cublasCscal, LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(double, double, cublasDscal, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(float, float, cublasSscal, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasZscal, LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - false) +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, cuComplex, cublasCscal, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -256,105 +196,73 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Scal< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const execution_space& space, const RV& R, \ - const AS& alpha, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - } else { \ - Scal::scal(space, R, \ - alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const execution_space& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, \ - LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, \ +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ - EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ - EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp index 14ecce2740..de1fa19cb3 100644 --- a/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_swap_tpl_spec_avail.hpp @@ -34,132 +34,83 @@ namespace Impl { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ - template <> \ - struct swap_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct swap_tpl_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #ifdef KOKKOS_ENABLE_OPENMP KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct swap_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct swap_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ - struct swap_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct swap_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SWAP_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp index 555c942c12..e74b498c33 100644 --- a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp @@ -26,9 +26,8 @@ namespace { template inline void swap_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - printf("KokkosBlas::swap<> TPL Blas specialization for < %s, %s, %s >\n", - typeid(XVector).name(), typeid(YVector).name(), - typeid(ExecutionSpace).name); + printf("KokkosBlas::swap<> TPL Blas specialization for < %s, %s, %s >\n", typeid(XVector).name(), + typeid(YVector).name(), typeid(ExecutionSpace).name); #endif } } // namespace @@ -42,110 +41,82 @@ inline void swap_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,double]"); \ - HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,double]"); \ + HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,float]"); \ - HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,float]"); \ + HostBlas::swap(X.extent_int(0), X.data(), 1, Y.data(), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_BLAS,complex]"); \ - HostBlas>::swap( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,complex]"); \ + HostBlas>::swap(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& /*space*/, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_BLAS,complex]"); \ - HostBlas>::swap( \ - X.extent_int(0), reinterpret_cast*>(X.data()), \ - 1, reinterpret_cast*>(Y.data()), 1); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_BLAS(LAYOUT, EXECSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& /*space*/, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_BLAS,complex]"); \ + HostBlas>::swap(X.extent_int(0), reinterpret_cast*>(X.data()), 1, \ + reinterpret_cast*>(Y.data()), 1); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -188,201 +159,131 @@ KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, false) namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_CUBLAS,complex]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasZswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_CUBLAS,complex]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::CudaBlasSingleton& singleton = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS @@ -394,169 +295,115 @@ KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sswap( \ - singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zswap( \ - singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, \ - YVector const& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cswap( \ - singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) - -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS diff --git a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp index 88a60e6d19..55e1383ed7 100644 --- a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp @@ -20,8 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 661393e445..679a5ddace 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -28,46 +28,34 @@ struct gemv_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTX, \ - LAYOUTY, MEMSPACE) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTX, LAYOUTY, MEMSPACE) \ + template \ + struct gemv_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif @@ -75,20 +63,16 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTX, \ - LAYOUTY, MEMSPACE) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTX, LAYOUTY, MEMSPACE) \ + template \ + struct gemv_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Note BMK: We use the same layout for A, X and Y because the GEMV @@ -96,30 +80,22 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // So this TPL version will match any layout combination, as long // as none are LayoutStride. -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif @@ -127,35 +103,27 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) #endif @@ -163,38 +131,31 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #if defined(KOKKOS_ENABLE_SYCL) -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ - template \ - struct gemv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutLeft) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutRight) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutRight) -KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, - Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, Kokkos::LayoutRight) #endif diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 07d9476b66..fcc5762f57 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -43,215 +43,157 @@ namespace Impl { transa = 'C'; \ } -#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ - one, beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), one, beta, Y.data(), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,float]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), one, \ - beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,float]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), one, beta, Y.data(), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_BLAS,complex]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - const std::complex alpha_val = alpha, beta_val = beta; \ - HostBlas >::gemv( \ - transa, M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(X.data()), one, \ - beta_val, reinterpret_cast*>(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv(transa, M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(X.data()), one, beta_val, \ + reinterpret_cast*>(Y.data()), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& /* space */, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_BLAS,complex]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - const std::complex alpha_val = alpha, beta_val = beta; \ - HostBlas >::gemv( \ - transa, M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(X.data()), one, \ - beta_val, reinterpret_cast*>(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv(transa, M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(X.data()), one, beta_val, \ + reinterpret_cast*>(Y.data()), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGEMV_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -284,238 +226,169 @@ namespace Impl { transa = CUBLAS_OP_C; \ } -#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDgemv(s.handle, transa, M, N, &alpha, \ - A.data(), LDA, X.data(), one, \ - &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSgemv(s.handle, transa, M, N, &alpha, \ - A.data(), LDA, X.data(), one, \ - &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasZgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemv( \ + s.handle, transa, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ + one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgemv( \ - s.handle, transa, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCgemv(s.handle, transa, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ + one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGEMV_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -548,198 +421,152 @@ namespace Impl { transa = rocblas_operation_conjugate_transpose; \ } -#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) @@ -782,8 +609,7 @@ inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { case 'C': return oneapi::mkl::transpose::conjtrans; default:; } - throw std::invalid_argument( - "Invalid mode for oneMKL (should be one of N, T, C)"); + throw std::invalid_argument("Invalid mode for oneMKL (should be one of N, T, C)"); } template @@ -797,78 +623,58 @@ struct kokkos_to_std_type_map { using type = std::complex::mag_type>; }; -#define KOKKOSBLAS2_GEMV_ONEMKL(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - using device_type = Kokkos::Device; \ - using mem_traits = Kokkos::MemoryTraits; \ - using AViewType = \ - Kokkos::View; \ - using XViewType = \ - Kokkos::View; \ - using YViewType = Kokkos::View; \ - \ - static void gemv(const ExecSpace& exec, const char kk_trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - if (beta == Kokkos::ArithTraits::zero()) { \ - Kokkos::deep_copy(Y, Kokkos::ArithTraits::zero()); \ - } \ - \ - bool row_major = std::is_same::value; \ - const std::int64_t M = A.extent(0); \ - const std::int64_t N = A.extent(1); \ - oneapi::mkl::transpose trans = mode_kk_to_onemkl(kk_trans[0]); \ - const std::int64_t LDA = row_major ? A.stride(0) : A.stride(1); \ - std::string label = "KokkosBlas::gemv[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - \ - Kokkos::Profiling::pushRegion(label); \ - using mag_type = kokkos_to_std_type_map< \ - SCALAR, Kokkos::ArithTraits::is_complex>::type; \ - const mag_type* a = reinterpret_cast(A.data()); \ - const mag_type* x = reinterpret_cast(X.data()); \ - mag_type* y = reinterpret_cast(Y.data()); \ - if (row_major) { \ - oneapi::mkl::blas::row_major::gemv(exec.sycl_queue(), trans, M, N, \ - alpha, a, LDA, x, 1, beta, y, 1); \ - } else { \ - oneapi::mkl::blas::column_major::gemv( \ - exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_GEMV_ONEMKL(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using device_type = Kokkos::Device; \ + using mem_traits = Kokkos::MemoryTraits; \ + using AViewType = Kokkos::View; \ + using XViewType = Kokkos::View; \ + using YViewType = Kokkos::View; \ + \ + static void gemv(const ExecSpace& exec, const char kk_trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + if (beta == Kokkos::ArithTraits::zero()) { \ + Kokkos::deep_copy(Y, Kokkos::ArithTraits::zero()); \ + } \ + \ + bool row_major = std::is_same::value; \ + const std::int64_t M = A.extent(0); \ + const std::int64_t N = A.extent(1); \ + oneapi::mkl::transpose trans = mode_kk_to_onemkl(kk_trans[0]); \ + const std::int64_t LDA = row_major ? A.stride(0) : A.stride(1); \ + std::string label = "KokkosBlas::gemv[TPL_ONEMKL," + Kokkos::ArithTraits::name() + "]"; \ + \ + Kokkos::Profiling::pushRegion(label); \ + using mag_type = kokkos_to_std_type_map::is_complex>::type; \ + const mag_type* a = reinterpret_cast(A.data()); \ + const mag_type* x = reinterpret_cast(X.data()); \ + mag_type* y = reinterpret_cast(Y.data()); \ + if (row_major) { \ + oneapi::mkl::blas::row_major::gemv(exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ + } else { \ + oneapi::mkl::blas::column_major::gemv(exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) -KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Experimental::SYCLDeviceUSMSpace, true) } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 3013689f34..b6156c2d3a 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -28,62 +28,40 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif @@ -91,112 +69,68 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // We use the same layout for X, Y and Abecause the GER interface will // switch the layouts of X and Y to that of A. So this TPL version will // match any layout combination, as long as none are LayoutStride. -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index bc1a10f61e..680df7c464 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -30,308 +30,225 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ - const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ - A.data(), LDA); \ - } else { \ - HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ - A.data(), LDA); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char /*trans*/[], typename AViewType::const_value_type& alpha, const XViewType& X, \ + const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ - const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ - A.data(), LDA); \ - } else { \ - HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ - A.data(), LDA); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char /*trans*/[], typename AViewType::const_value_type& alpha, const XViewType& X, \ + const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = \ - static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - HostBlas>::gerc( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } \ - } else { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc(M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = \ - static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - HostBlas>::gerc( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } \ - } else { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc(M, N, alpha_val, reinterpret_cast*>(X.data()), \ + one, reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru(M, N, alpha_val, reinterpret_cast*>(Y.data()), \ + one, reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index 3f80144f62..fdb09d1c91 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -30,324 +30,231 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), one, \ - A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), one, \ - A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), one, \ - A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), one, \ - A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index c21b61befa..26a0da5864 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -30,295 +30,221 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), \ - one, A.data(), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), \ - one, A.data(), LDA)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), \ - one, A.data(), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), \ - one, A.data(), LDA)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ - GER::ger(space, trans, alpha, X, Y, A); \ - } \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index 6f6a7a2e9f..d894433540 100644 --- a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -35,14 +35,12 @@ namespace Impl { // Note: using GEMM because there is no GEMV in MKL compact routines -#define __IMPL_KK_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ - inline void kk_mkl_gemm_compact( \ - MKL_LAYOUT layout, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, \ - MKL_INT m, MKL_INT n, MKL_INT k, SCALAR alpha, const SCALAR *a, \ - MKL_INT ldap, const SCALAR *b, MKL_INT ldbp, SCALAR beta, SCALAR *c, \ - MKL_INT ldcp, MKL_COMPACT_PACK format, MKL_INT nm) { \ - MKL_ROUTINE(layout, transa, transb, m, n, k, alpha, a, ldap, b, ldbp, \ - beta, c, ldcp, format, nm); \ +#define __IMPL_KK_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ + inline void kk_mkl_gemm_compact(MKL_LAYOUT layout, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, MKL_INT m, MKL_INT n, \ + MKL_INT k, SCALAR alpha, const SCALAR *a, MKL_INT ldap, const SCALAR *b, \ + MKL_INT ldbp, SCALAR beta, SCALAR *c, MKL_INT ldcp, MKL_COMPACT_PACK format, \ + MKL_INT nm) { \ + MKL_ROUTINE(layout, transa, transb, m, n, k, alpha, a, ldap, b, ldbp, beta, c, ldcp, format, nm); \ } __IMPL_KK_MKL_DGEMM_COMPACT(double, mkl_dgemm_compact) @@ -81,23 +79,17 @@ inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_AVX512; } -template -void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, - const AViewType &A, const xViewType &x, const ScalarType beta, - const yViewType &y) { +template +void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, const AViewType &A, const xViewType &x, + const ScalarType beta, const yViewType &y) { typedef typename yViewType::value_type vector_type; - static_assert(KokkosBatched::is_vector::value, - "value type is not vector type"); + static_assert(KokkosBatched::is_vector::value, "value type is not vector type"); using value_type = typename vector_type::value_type; - static_assert(std::is_same::value && - std::is_same::value, + static_assert(std::is_same::value && + std::is_same::value, "scalar type mismatch"); - if (A.stride_0() != 1 && A.stride_1() != 1 && x.stride_0() != 1 && - y.stride_0() != 1) { + if (A.stride_0() != 1 && A.stride_1() != 1 && x.stride_0() != 1 && y.stride_0() != 1) { Kokkos::abort("Strided inputs are not supported in MKL gemv/gemm"); } @@ -107,21 +99,18 @@ void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, const int n = 1; const int k = A.extent_int(transposed ? 0 : 1); - const bool col_major = A.stride_0() == 1; - const MKL_LAYOUT layout = col_major ? MKL_COL_MAJOR : MKL_ROW_MAJOR; - const MKL_INT A_ld = KOKKOSKERNELS_MACRO_MAX(1, A.extent(col_major ? 0 : 1)); - const MKL_COMPACT_PACK format = - Impl::mkl_compact_format(); + const bool col_major = A.stride_0() == 1; + const MKL_LAYOUT layout = col_major ? MKL_COL_MAJOR : MKL_ROW_MAJOR; + const MKL_INT A_ld = KOKKOSKERNELS_MACRO_MAX(1, A.extent(col_major ? 0 : 1)); + const MKL_COMPACT_PACK format = Impl::mkl_compact_format(); // cast away simd-vector pointers auto A_data = reinterpret_cast(A.data()); auto x_data = reinterpret_cast(x.data()); auto y_data = reinterpret_cast(y.data()); - Impl::kk_mkl_gemm_compact(layout, trans, MKL_NOTRANS, m, n, k, - (value_type)alpha, A_data, A_ld, x_data, 1, - (value_type)beta, y_data, 1, format, - (MKL_INT)vector_type::vector_length); + Impl::kk_mkl_gemm_compact(layout, trans, MKL_NOTRANS, m, n, k, (value_type)alpha, A_data, A_ld, x_data, 1, + (value_type)beta, y_data, 1, format, (MKL_INT)vector_type::vector_length); } } // namespace Impl @@ -131,12 +120,9 @@ void kk_mkl_gemv(MKL_TRANSPOSE trans, const ScalarType alpha, /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { Impl::kk_mkl_gemv(MKL_NOTRANS, alpha, A, x, beta, y); return 0; } @@ -146,12 +132,9 @@ SerialGemv::invoke( /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { Impl::kk_mkl_gemv(MKL_TRANS, alpha, A, x, beta, y); return 0; } @@ -161,12 +144,9 @@ SerialGemv::invoke( /// template <> -template -KOKKOS_INLINE_FUNCTION int -SerialGemv::invoke( - const ScalarType alpha, const AViewType &A, const xViewType &x, - const ScalarType beta, const yViewType &y) { +template +KOKKOS_INLINE_FUNCTION int SerialGemv::invoke( + const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { Impl::kk_mkl_gemv(MKL_CONJTRANS, alpha, A, x, beta, y); return 0; } diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp index 59fb154d35..2c3cdc990e 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp @@ -28,66 +28,40 @@ struct syr2_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr2_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif @@ -95,108 +69,64 @@ KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr2_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr2_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) - -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp index f22e800bc5..4aa32b5b0e 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp @@ -29,286 +29,216 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,double]"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ - one, A.data(), LDA); \ - } else { \ - /* blasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + /* blasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,float]"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ - one, A.data(), LDA); \ - } else { \ - /* blasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), one, A.data(), LDA); \ + } else { \ + /* blasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasZsyr2() => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } else { \ - if (A_is_ll) { \ - HostBlas>::her2( \ - uplo[0], N, alpha, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::her2(uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasCsyr2() => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } else { \ - if (A_is_ll) { \ - HostBlas>::her2( \ - uplo[0], N, alpha, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::her2(uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp index ca98fedf0d..4dd95aa79a 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp @@ -22,349 +22,257 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? CUBLAS_FILL_MODE_LOWER \ - : CUBLAS_FILL_MODE_UPPER; +#define KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; -#define KOKKOSBLAS2_DSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCsyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp index 869c065af2..84085224ac 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -28,307 +28,233 @@ namespace Impl { const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? rocblas_fill_lower \ - : rocblas_fill_upper; + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') ? rocblas_fill_lower : rocblas_fill_upper; -#define KOKKOSBLAS2_DSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ - Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR2*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr2(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, \ + Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp index f537b3854a..e1eb94e425 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -28,59 +28,38 @@ struct syr_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif @@ -88,102 +67,60 @@ KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct syr_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) - -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) -KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index fc8fb949d7..5b0eb0ec52 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -29,254 +29,186 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ - LDA); \ - } else { \ - /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), LDA); \ + } else { \ + /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ - LDA); \ - } else { \ - /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), LDA); \ + } else { \ + /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasZsyr() => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } else { \ - if (A_is_ll) { \ - HostBlas>::her( \ - uplo[0], N, alpha.real(), \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr() => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::her(uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_BLAS,complex"); \ - KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - /* No blasCsyr() => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - HostBlas>::her( \ - uplo[0], N, alpha.real(), \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr() => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + HostBlas>::her(uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) - -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, - false) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - true) -KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, - false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index dad3c93dbc..43b177d9a5 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -22,309 +22,224 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ - const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ - constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? CUBLAS_FILL_MODE_LOWER \ - : CUBLAS_FILL_MODE_UPPER; - -#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher( \ - s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCsyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) - -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, - false) - -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - true) -KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, - false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index cf02e9e207..59c99c1225 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -28,283 +28,205 @@ namespace Impl { const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ - ? rocblas_fill_lower \ - : rocblas_fill_upper; + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') ? rocblas_fill_lower : rocblas_fill_upper; -#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr( \ - s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher( \ - s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, \ - const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr( \ - s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher( \ - s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr( \ - space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, - false) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - true) -KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, - false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 8e96898b10..0dd3ef81e9 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -28,46 +28,34 @@ struct gemm_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) -#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEMSPACE) \ - template \ - struct gemm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, MEMSPACE) \ + template \ + struct gemm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif @@ -75,111 +63,78 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) -#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEMSPACE) \ - template \ - struct gemm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, LAYOUTC, MEMSPACE) \ + template \ + struct gemm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct gemm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct gemm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) - -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, - Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIPSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) + +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIPSpace) +KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 68bf2708ec..52123a9daf 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -23,130 +23,92 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const ExecSpace& /* space*/, const char transA[], \ - const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const KK_INT M = C.extent(0); \ - const KK_INT N = C.extent(1); \ - const KK_INT K = A.extent(A_t ? 0 : 1); \ - \ - bool A_is_lr = std::is_same::value; \ - bool B_is_lr = std::is_same::value; \ - bool C_is_lr = std::is_same::value; \ - \ - const KK_INT AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const KK_INT BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const KK_INT CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ - \ - const BASE_SCALAR_TYPE alpha_val = alpha, beta_val = beta; \ - if (!A_is_lr && !B_is_lr && !C_is_lr) \ - HostBlas::gemm( \ - transA[0], transB[0], M, N, K, alpha_val, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - beta_val, reinterpret_cast(C.data()), LDC); \ - if (A_is_lr && B_is_lr && C_is_lr) \ - HostBlas::gemm( \ - transB[0], transA[0], N, M, K, alpha_val, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - beta_val, reinterpret_cast(C.data()), LDC); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const ExecSpace& /* space*/, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS," #SCALAR_TYPE "]"); \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const KK_INT M = C.extent(0); \ + const KK_INT N = C.extent(1); \ + const KK_INT K = A.extent(A_t ? 0 : 1); \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + const KK_INT AST = A_is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const KK_INT BST = B_is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const KK_INT CST = C_is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ + \ + const BASE_SCALAR_TYPE alpha_val = alpha, beta_val = beta; \ + if (!A_is_lr && !B_is_lr && !C_is_lr) \ + HostBlas::gemm(transA[0], transB[0], M, N, K, alpha_val, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, beta_val, \ + reinterpret_cast(C.data()), LDC); \ + if (A_is_lr && B_is_lr && C_is_lr) \ + HostBlas::gemm(transB[0], transA[0], N, M, K, alpha_val, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, beta_val, \ + reinterpret_cast(C.data()), LDC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_DGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_BLAS(double, double, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_DGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(double, double, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS3_SGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(float, float, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS3_ZGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_SGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_BLAS(float, float, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_CGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, \ - LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS3_CGEMM_BLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, \ - LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) - -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_DGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_SGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -160,195 +122,131 @@ KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, \ - LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const ExecSpace& space, const char transA[], \ - const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool A_is_lr = std::is_same::value; \ - bool B_is_lr = std::is_same::value; \ - bool C_is_lr = std::is_same::value; \ - \ - const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ - \ - cublasOperation_t transa = trans_mode_kk_to_cublas(transA); \ - cublasOperation_t transb = trans_mode_kk_to_cublas(transB); \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutLeftThreshold) || \ - (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm( \ - alpha, A, B, beta, C); \ - bool conjT = (std::is_same::value || \ - std::is_same::value) \ - ? false \ - : (transa == CUBLAS_OP_C ? true : false); \ - gemm.run(space, conjT); \ - } else { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (!A_is_lr && !B_is_lr && !C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - if (A_is_lr && B_is_lr && C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const ExecSpace& space, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 0 : 1)); \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + const int AST = A_is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const int BST = B_is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const int CST = C_is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ + \ + cublasOperation_t transa = trans_mode_kk_to_cublas(transA); \ + cublasOperation_t transb = trans_mode_kk_to_cublas(transB); \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && M * N < numDotsLayoutLeftThreshold) || \ + (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm(alpha, A, B, beta, C); \ + bool conjT = (std::is_same::value || std::is_same::value) \ + ? false \ + : (transa == CUBLAS_OP_C ? true : false); \ + gemm.run(space, conjT); \ + } else { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (!A_is_lr && !B_is_lr && !C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, transa, transb, M, N, K, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ + if (A_is_lr && B_is_lr && C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, transb, transa, N, M, K, reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(double, double, cublasDgemm, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(double, double, cublasDgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(float, float, cublasSgemm, LAYOUTA, LAYOUTB, \ - LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(float, float, cublasSgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuDoubleComplex, \ - cublasZgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasZgemm, LAYOUTA, LAYOUTB, LAYOUTC, \ + MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuComplex, cublasCgemm, \ - LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_CUBLAS(Kokkos::complex, cuComplex, cublasCgemm, LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace, false) - -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -362,120 +260,93 @@ KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, \ - ROCBLAS_FN, LAYOUT, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const typename CViewType::execution_space& space, \ - const char transA[], const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool is_lr = std::is_same::value; \ - \ - const int AST = is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ - \ - rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ - rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!is_lr && transa != rocblas_operation_none && \ - transb == rocblas_operation_none && \ - M * N < numDotsLayoutLeftThreshold) || \ - (is_lr && transa != rocblas_operation_none && \ - transb == rocblas_operation_none && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm( \ - alpha, A, B, beta, C); \ - bool conjT = \ - (std::is_same::value || \ - std::is_same::value) \ - ? false \ - : (transa == rocblas_operation_conjugate_transpose ? true \ - : false); \ - gemm.run(space, conjT); \ - } else { \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (!is_lr) \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ - s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - else \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ - s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const typename CViewType::execution_space& space, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 0 : 1)); \ + \ + bool is_lr = std::is_same::value; \ + \ + const int AST = is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const int BST = is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const int CST = is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ + \ + rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ + rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutLeftThreshold) || \ + (is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm(alpha, A, B, beta, C); \ + bool conjT = (std::is_same::value || std::is_same::value) \ + ? false \ + : (transa == rocblas_operation_conjugate_transpose ? true : false); \ + gemm.run(space, conjT); \ + } else { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (!is_lr) \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transa, transb, M, N, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + else \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transb, transa, N, M, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_DGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(double, double, rocblas_dgemm, LAYOUT, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_DGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(double, double, rocblas_dgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_SGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(float, float, rocblas_sgemm, LAYOUT, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_SGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(float, float, rocblas_sgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_double_complex, \ - rocblas_zgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_ZGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_double_complex, rocblas_zgemm, LAYOUT, MEM_SPACE, \ + ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_CGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_float_complex, \ - rocblas_cgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_CGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_XGEMM_ROCBLAS(Kokkos::complex, rocblas_float_complex, rocblas_cgemm, LAYOUT, MEM_SPACE, \ + ETI_SPEC_AVAIL) KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index 010b44a154..83e39a240e 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -29,38 +29,26 @@ struct trmm_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trmm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trmm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS @@ -68,61 +56,40 @@ KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trmm_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trmm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp index 53c73f7416..4e68c08dec 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp @@ -24,136 +24,103 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_TRMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trmm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - bool B_is_layout_left = \ - std::is_same::value; \ - \ - const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_layout_left) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - if (A_is_layout_left) \ - HostBlas::trmm( \ - side_, uplo_, trans[0], diag[0], M, N, alpha, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - else \ - HostBlas::trmm( \ - side_, uplo_, trans[0], diag[0], N, M, alpha, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_TRMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trmm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_BLAS," #SCALAR_TYPE "]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + bool B_is_layout_left = std::is_same::value; \ + \ + const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_layout_left) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + if (A_is_layout_left) \ + HostBlas::trmm(side_, uplo_, trans[0], diag[0], M, N, alpha, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB); \ + else \ + HostBlas::trmm(side_, uplo_, trans[0], diag[0], N, M, alpha, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_DTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(double, double, LAYOUTA, LAYOUTB, MEM_SPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_BLAS(double, double, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS3_STRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(float, float, LAYOUTA, LAYOUTB, MEM_SPACE, \ - ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_BLAS(float, float, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS3_ZTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, \ - LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_CTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, \ - LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS3_CTRMM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_TRMM_BLAS(Kokkos::complex, std::complex, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) // Explicitly define the TRMM class for all permutations listed below -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) - -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) - -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) - -KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) + +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -166,196 +133,143 @@ KOKKOSBLAS3_CTRMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_TRMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, \ - LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trmm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trmm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - bool B_is_layout_left = \ - std::is_same::value; \ - \ - const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_layout_left) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_layout_left) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), \ - LDA, reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), \ - LDA, reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_TRMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trmm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + bool B_is_layout_left = std::is_same::value; \ + \ + const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_layout_left ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_layout_left) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_layout_left) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ + LDB, reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ + LDB, reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_DTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(double, double, cublasDtrmm, LAYOUTA, LAYOUTB, \ - MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_CUBLAS(double, double, cublasDtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS3_STRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(float, float, cublasStrmm, LAYOUTA, LAYOUTB, \ - MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_CUBLAS(float, float, cublasStrmm, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS3_ZTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuDoubleComplex, \ - cublasZtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, \ +#define KOKKOSBLAS3_ZTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuDoubleComplex, cublasZtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, \ ETI_SPEC_AVAIL) #define KOKKOSBLAS3_CTRMM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuComplex, cublasCtrmm, \ - LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) + KOKKOSBLAS3_TRMM_CUBLAS(Kokkos::complex, cuComplex, cublasCtrmm, LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) // Explicitly define the TRMM class for all permutations listed below -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) - -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) - -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp index d1836809ec..21289655de 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp @@ -29,38 +29,26 @@ struct trsm_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trsm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trsm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif @@ -68,61 +56,40 @@ KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, \ - MEMSPACE) \ - template \ - struct trsm_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTA, LAYOUTB, MEMSPACE) \ + template \ + struct trsm_tpl_spec_avail, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, +KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp index ec36388094..7074a4e0e2 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp @@ -23,329 +23,275 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_DTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,double]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - if (A_is_ll) \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, \ - A.data(), LDA, B.data(), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_DTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,double]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + if (A_is_ll) \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, A.data(), LDA, B.data(), LDB); \ + else \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, A.data(), LDA, B.data(), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_STRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,float]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - if (A_is_ll) \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ - HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, \ - A.data(), LDA, B.data(), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_STRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,float]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + if (A_is_ll) \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha, A.data(), LDA, B.data(), LDB); \ + else \ + HostBlas::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha, A.data(), LDA, B.data(), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_ZTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_BLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - const std::complex alpha_val = alpha; \ - if (A_is_ll) \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - else \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_ZTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + const std::complex alpha_val = alpha; \ + if (A_is_ll) \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + else \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_CTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_BLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - char side_; \ - char uplo_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'L'; \ - else \ - side_ = 'R'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'L'; \ - else \ - uplo_ = 'U'; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = 'R'; \ - else \ - side_ = 'L'; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = 'U'; \ - else \ - uplo_ = 'L'; \ - } \ - \ - const std::complex alpha_val = alpha; \ - if (A_is_ll) \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - else \ - HostBlas >::trsm( \ - side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(B.data()), LDB); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_CTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& /*space*/, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + char side_; \ + char uplo_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'L'; \ + else \ + side_ = 'R'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'L'; \ + else \ + uplo_ = 'U'; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = 'R'; \ + else \ + side_ = 'L'; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = 'U'; \ + else \ + uplo_ = 'L'; \ + } \ + \ + const std::complex alpha_val = alpha; \ + if (A_is_ll) \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + else \ + HostBlas >::trsm(side_, uplo_, trans[0], diag[0], N, M, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(B.data()), LDB); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_DTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_STRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_ZTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, true) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::HostSpace, false) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, true) -KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::HostSpace, false) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas @@ -358,450 +304,370 @@ KOKKOSBLAS3_CTRSM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_DTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,double]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_DTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,double]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_STRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,float]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_STRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,float]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ - s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ - s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], const char diag[], \ - typename BViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), \ - LDB = (BST == 0) ? 1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_DTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_STRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_ZTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS3_CTRSM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas_Cuda_tpl.hpp b/blas/tpls/KokkosBlas_Cuda_tpl.hpp index d85785316e..d80e3a23d8 100644 --- a/blas/tpls/KokkosBlas_Cuda_tpl.hpp +++ b/blas/tpls/KokkosBlas_Cuda_tpl.hpp @@ -24,8 +24,7 @@ namespace Impl { CudaBlasSingleton::CudaBlasSingleton() { cublasStatus_t stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); Kokkos::push_finalize_hook([&]() { cublasDestroy(handle); }); } diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index dc04ca7e67..6989aea34d 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -34,63 +34,41 @@ void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, /* */ float* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(dscal, DSCAL)(const KK_INT* N, const double* alpha, /* */ double* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(cscal, - CSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(zscal, - ZSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(cscal, CSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(zscal, ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); /// /// max /// -KK_INT F77_BLAS_MANGLE(isamax, ISAMAX)(const KK_INT* N, const float* x, - const KK_INT* x_inc); -KK_INT F77_BLAS_MANGLE(idamax, IDAMAX)(const KK_INT* N, const double* x, - const KK_INT* x_inc); -KK_INT F77_BLAS_MANGLE(icamax, ICAMAX)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); -KK_INT F77_BLAS_MANGLE(izamax, IZAMAX)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(isamax, ISAMAX)(const KK_INT* N, const float* x, const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(idamax, IDAMAX)(const KK_INT* N, const double* x, const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(icamax, ICAMAX)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(izamax, IZAMAX)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); /// /// nrm2 /// -float F77_BLAS_MANGLE(snrm2, SNRM2)(const KK_INT* N, const float* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dnrm2, DNRM2)(const KK_INT* N, const double* x, - const KK_INT* x_inc); -float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); +float F77_BLAS_MANGLE(snrm2, SNRM2)(const KK_INT* N, const float* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dnrm2, DNRM2)(const KK_INT* N, const double* x, const KK_INT* x_inc); +float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); /// /// sum /// -float F77_BLAS_MANGLE(sasum, SASUM)(const KK_INT* N, const float* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dasum, DASUM)(const KK_INT* N, const double* x, - const KK_INT* x_inc); -float F77_BLAS_MANGLE(scasum, SCASUM)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); -double F77_BLAS_MANGLE(dzasum, DZASUM)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc); +float F77_BLAS_MANGLE(sasum, SASUM)(const KK_INT* N, const float* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dasum, DASUM)(const KK_INT* N, const double* x, const KK_INT* x_inc); +float F77_BLAS_MANGLE(scasum, SCASUM)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); +double F77_BLAS_MANGLE(dzasum, DZASUM)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc); /// /// dot /// -float F77_BLAS_MANGLE(sdot, SDOT)(const KK_INT* N, const float* x, - const KK_INT* x_inc, const float* y, +float F77_BLAS_MANGLE(sdot, SDOT)(const KK_INT* N, const float* x, const KK_INT* x_inc, const float* y, const KK_INT* y_inc); -double F77_BLAS_MANGLE(ddot, DDOT)(const KK_INT* N, const double* x, - const KK_INT* x_inc, const double* y, +double F77_BLAS_MANGLE(ddot, DDOT)(const KK_INT* N, const double* x, const KK_INT* x_inc, const double* y, const KK_INT* y_inc); #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) // clang-format off @@ -106,77 +84,49 @@ typedef struct { double vals[2]; } _kk_double2; -_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); -_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); -_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); -_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const KK_INT* N, - const std::complex* x, - const KK_INT* x_inc, - const std::complex* y, - const KK_INT* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const KK_INT* N, const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); #else -void F77_BLAS_MANGLE(cdotu, - CDOTU)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(zdotu, - ZDOTU)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(cdotc, - CDOTC)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(zdotc, - ZDOTC)(std::complex* res, const KK_INT* N, - const std::complex* x, const KK_INT* x_inc, - const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(cdotu, CDOTU)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(zdotu, ZDOTU)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(cdotc, CDOTC)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(zdotc, ZDOTC)(std::complex* res, const KK_INT* N, const std::complex* x, + const KK_INT* x_inc, const std::complex* y, const KK_INT* y_inc); #endif /// /// axpy /// -void F77_BLAS_MANGLE(saxpy, SAXPY)(const KK_INT* N, const float* alpha, - const float* x, const KK_INT* x_inc, +void F77_BLAS_MANGLE(saxpy, SAXPY)(const KK_INT* N, const float* alpha, const float* x, const KK_INT* x_inc, /* */ float* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(daxpy, DAXPY)(const KK_INT* N, const double* alpha, - const double* x, const KK_INT* x_inc, +void F77_BLAS_MANGLE(daxpy, DAXPY)(const KK_INT* N, const double* alpha, const double* x, const KK_INT* x_inc, /* */ double* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(caxpy, - CAXPY)(const KK_INT* N, const std::complex* alpha, - const std::complex* x, const KK_INT* x_inc, - /* */ std::complex* y, const KK_INT* y_inc); -void F77_BLAS_MANGLE(zaxpy, - ZAXPY)(const KK_INT* N, const std::complex* alpha, - const std::complex* x, const KK_INT* x_inc, - /* */ std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(caxpy, CAXPY)(const KK_INT* N, const std::complex* alpha, const std::complex* x, + const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(zaxpy, ZAXPY)(const KK_INT* N, const std::complex* alpha, const std::complex* x, + const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); /// /// rot /// -void F77_BLAS_MANGLE(srot, SROT)(KK_INT const* N, float* X, KK_INT const* incx, - float* Y, KK_INT const* incy, float* c, +void F77_BLAS_MANGLE(srot, SROT)(KK_INT const* N, float* X, KK_INT const* incx, float* Y, KK_INT const* incy, float* c, float* s); -void F77_BLAS_MANGLE(drot, DROT)(KK_INT const* N, double* X, KK_INT const* incx, - double* Y, KK_INT const* incy, double* c, - double* s); -void F77_BLAS_MANGLE(crot, CROT)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, +void F77_BLAS_MANGLE(drot, DROT)(KK_INT const* N, double* X, KK_INT const* incx, double* Y, KK_INT const* incy, + double* c, double* s); +void F77_BLAS_MANGLE(crot, CROT)(KK_INT const* N, std::complex* X, KK_INT const* incx, std::complex* Y, KK_INT const* incy, float* c, float* s); -void F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, +void F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, KK_INT const* incx, std::complex* Y, KK_INT const* incy, double* c, double* s); /// @@ -184,106 +134,73 @@ void F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, /// void F77_BLAS_MANGLE(srotg, SROTG)(float* a, float* b, float* c, float* s); void F77_BLAS_MANGLE(drotg, DROTG)(double* a, double* b, double* c, double* s); -void F77_BLAS_MANGLE(crotg, CROTG)(std::complex* a, - std::complex* b, float* c, - std::complex* s); -void F77_BLAS_MANGLE(zrotg, ZROTG)(std::complex* a, - std::complex* b, double* c, +void F77_BLAS_MANGLE(crotg, CROTG)(std::complex* a, std::complex* b, float* c, std::complex* s); +void F77_BLAS_MANGLE(zrotg, ZROTG)(std::complex* a, std::complex* b, double* c, std::complex* s); /// /// rotm /// -void F77_BLAS_MANGLE(srotm, SROTM)(const KK_INT* n, float* X, - const KK_INT* incx, float* Y, - const KK_INT* incy, float const* param); -void F77_BLAS_MANGLE(drotm, DROTM)(const KK_INT* n, double* X, - const KK_INT* incx, double* Y, - const KK_INT* incy, double const* param); +void F77_BLAS_MANGLE(srotm, SROTM)(const KK_INT* n, float* X, const KK_INT* incx, float* Y, const KK_INT* incy, + float const* param); +void F77_BLAS_MANGLE(drotm, DROTM)(const KK_INT* n, double* X, const KK_INT* incx, double* Y, const KK_INT* incy, + double const* param); /// /// rotmg /// -void F77_BLAS_MANGLE(srotmg, SROTMG)(float* d1, float* d2, float* x1, - const float* y1, float* param); -void F77_BLAS_MANGLE(drotmg, DROTMG)(double* d1, double* d2, double* x1, - const double* y1, double* param); +void F77_BLAS_MANGLE(srotmg, SROTMG)(float* d1, float* d2, float* x1, const float* y1, float* param); +void F77_BLAS_MANGLE(drotmg, DROTMG)(double* d1, double* d2, double* x1, const double* y1, double* param); /// /// swap /// -void F77_BLAS_MANGLE(sswap, SSWAP)(KK_INT const* N, float* X, - KK_INT const* incx, float* Y, - KK_INT const* incy); -void F77_BLAS_MANGLE(dswap, DSWAP)(KK_INT const* N, double* X, - KK_INT const* incx, double* Y, - KK_INT const* incy); -void F77_BLAS_MANGLE(cswap, CSWAP)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, - KK_INT const* incy); -void F77_BLAS_MANGLE(zswap, ZSWAP)(KK_INT const* N, std::complex* X, - KK_INT const* incx, std::complex* Y, +void F77_BLAS_MANGLE(sswap, SSWAP)(KK_INT const* N, float* X, KK_INT const* incx, float* Y, KK_INT const* incy); +void F77_BLAS_MANGLE(dswap, DSWAP)(KK_INT const* N, double* X, KK_INT const* incx, double* Y, KK_INT const* incy); +void F77_BLAS_MANGLE(cswap, CSWAP)(KK_INT const* N, std::complex* X, KK_INT const* incx, std::complex* Y, KK_INT const* incy); +void F77_BLAS_MANGLE(zswap, ZSWAP)(KK_INT const* N, std::complex* X, KK_INT const* incx, + std::complex* Y, KK_INT const* incy); /// /// Gemv /// -void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, KK_INT*, KK_INT*, const float*, - const float*, KK_INT*, const float*, KK_INT*, - const float*, +void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, const float*, + KK_INT*, const float*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, KK_INT*, KK_INT*, const double*, - const double*, KK_INT*, const double*, +void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, const double*, KK_INT*, const double*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); /// /// Ger /// -void F77_BLAS_MANGLE(sger, SGER)(KK_INT*, KK_INT*, const float*, const float*, - KK_INT*, const float*, KK_INT*, float*, - KK_INT*); -void F77_BLAS_MANGLE(dger, DGER)(KK_INT*, KK_INT*, const double*, const double*, - KK_INT*, const double*, KK_INT*, double*, +void F77_BLAS_MANGLE(sger, SGER)(KK_INT*, KK_INT*, const float*, const float*, KK_INT*, const float*, KK_INT*, float*, KK_INT*); -void F77_BLAS_MANGLE(cgeru, CGERU)(KK_INT*, KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgeru, ZGERU)(KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); -void F77_BLAS_MANGLE(cgerc, CGERC)(KK_INT*, KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgerc, ZGERC)(KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, - std::complex*, KK_INT*); +void F77_BLAS_MANGLE(dger, DGER)(KK_INT*, KK_INT*, const double*, const double*, KK_INT*, const double*, KK_INT*, + double*, KK_INT*); +void F77_BLAS_MANGLE(cgeru, CGERU)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgeru, ZGERU)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); +void F77_BLAS_MANGLE(cgerc, CGERC)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgerc, ZGERC)(KK_INT*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, + const std::complex*, KK_INT*, std::complex*, KK_INT*); /// /// Syr /// -void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, KK_INT*, const float*, - const float*, KK_INT*, float*, KK_INT*); -void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, - const double*, KK_INT*, double*, KK_INT*); +void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, KK_INT*, const float*, const float*, KK_INT*, float*, KK_INT*); +void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, const double*, KK_INT*, double*, KK_INT*); // Although there is a cgeru, there is no csyru // Although there is a zgeru, there is no zsyru // Although there is a cgerc, there is no csyrc, but there is cher (see below) @@ -293,21 +210,17 @@ void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, /// Her /// -void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Syr2 /// -void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, KK_INT*, const float*, - const float*, const KK_INT*, const float*, +void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, KK_INT*, const float*, const float*, const KK_INT*, const float*, KK_INT*, float*, KK_INT*); -void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, - const double*, const KK_INT*, const double*, +void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, const double*, const KK_INT*, const double*, KK_INT*, double*, KK_INT*); // Although there is a cgeru, there is no csyr2u // Although there is a zgeru, there is no zsyr2u @@ -318,58 +231,42 @@ void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, /// Her2 /// -void F77_BLAS_MANGLE(cher2, CHER2)(const char*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cher2, CHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, + KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, + KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Trsv /// -void F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, - KK_INT*, const float*, KK_INT*, +void F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, KK_INT*, const float*, KK_INT*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, - KK_INT*, const double*, KK_INT*, +void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, KK_INT*, const double*, KK_INT*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, - KK_INT*, const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, - KK_INT*, const std::complex*, - KK_INT*, +void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, KK_INT*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Gemm /// -void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const float*, const float*, KK_INT*, - const float*, KK_INT*, const float*, +void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const float*, const float*, + KK_INT*, const float*, KK_INT*, const float*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const double*, const double*, - KK_INT*, const double*, KK_INT*, - const double*, +void F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const double*, const double*, + KK_INT*, const double*, KK_INT*, const double*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, - KK_INT*, const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); @@ -377,69 +274,51 @@ void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, /// Herk /// -void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, KK_INT*, KK_INT*, - const float*, const float*, KK_INT*, +void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, const float*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, KK_INT*, KK_INT*, - const double*, const double*, KK_INT*, +void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, const double*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, +void F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, - const std::complex*, +void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, const std::complex*, /* */ std::complex*, KK_INT*); /// /// Trmm /// -void F77_BLAS_MANGLE(strmm, STRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const float*, +void F77_BLAS_MANGLE(strmm, STRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dtrmm, DTRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const double*, +void F77_BLAS_MANGLE(dtrmm, DTRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(ctrmm, CTRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ctrmm, CTRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(ztrmm, ZTRMM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ztrmm, ZTRMM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); /// /// Trsm /// -void F77_BLAS_MANGLE(strsm, STRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const float*, +void F77_BLAS_MANGLE(strsm, STRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const float*, const float*, KK_INT*, /* */ float*, KK_INT*); -void F77_BLAS_MANGLE(dtrsm, DTRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, const double*, +void F77_BLAS_MANGLE(dtrsm, DTRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, const double*, const double*, KK_INT*, /* */ double*, KK_INT*); -void F77_BLAS_MANGLE(ctrsm, CTRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ctrsm, CTRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); -void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, - const char*, KK_INT*, KK_INT*, - const std::complex*, - const std::complex*, KK_INT*, +void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, const char*, KK_INT*, KK_INT*, + const std::complex*, const std::complex*, KK_INT*, /* */ std::complex*, KK_INT*); } @@ -447,12 +326,10 @@ void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, /* */ float* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(dscal, DSCAL)(const KK_INT* N, const double* alpha, /* */ double* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(cscal, - CSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); -void F77_BLAS_MANGLE(zscal, - ZSCAL)(const KK_INT* N, const std::complex* alpha, - /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(cscal, CSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(zscal, ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); #define F77_FUNC_SSCAL F77_BLAS_MANGLE(sscal, SSCAL) #define F77_FUNC_DSCAL F77_BLAS_MANGLE(dscal, DSCAL) @@ -581,19 +458,17 @@ float HostBlas::asum(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_SASUM(&n, x, &x_inc); } template <> -float HostBlas::dot(KK_INT n, const float* x, KK_INT x_inc, - const float* y, KK_INT y_inc) { +float HostBlas::dot(KK_INT n, const float* x, KK_INT x_inc, const float* y, KK_INT y_inc) { return F77_FUNC_SDOT(&n, x, &x_inc, y, &y_inc); } template <> -void HostBlas::axpy(KK_INT n, const float alpha, const float* x, - KK_INT x_inc, +void HostBlas::axpy(KK_INT n, const float alpha, const float* x, KK_INT x_inc, /* */ float* y, KK_INT y_inc) { F77_FUNC_SAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(KK_INT const N, float* X, KK_INT const incx, float* Y, - KK_INT const incy, float* c, float* s) { +void HostBlas::rot(KK_INT const N, float* X, KK_INT const incx, float* Y, KK_INT const incy, float* c, + float* s) { F77_FUNC_SROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -601,81 +476,67 @@ void HostBlas::rotg(float* a, float* b, float* c, float* s) { F77_FUNC_SROTG(a, b, c, s); } template <> -void HostBlas::rotm(const KK_INT n, float* X, const KK_INT incx, - float* Y, const KK_INT incy, const float* param) { +void HostBlas::rotm(const KK_INT n, float* X, const KK_INT incx, float* Y, const KK_INT incy, + const float* param) { F77_FUNC_SROTM(&n, X, &incx, Y, &incy, param); } template <> -void HostBlas::rotmg(float* d1, float* d2, float* x1, const float* y1, - float* param) { +void HostBlas::rotmg(float* d1, float* d2, float* x1, const float* y1, float* param) { F77_FUNC_SROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(KK_INT const N, float* X, KK_INT const incx, - float* Y, KK_INT const incy) { +void HostBlas::swap(KK_INT const N, float* X, KK_INT const incx, float* Y, KK_INT const incy) { F77_FUNC_SSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, - const float alpha, const float* a, KK_INT lda, +void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, const float alpha, const float* a, KK_INT lda, const float* b, KK_INT ldb, const float beta, /* */ float* c, KK_INT ldc) { F77_FUNC_SGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger(KK_INT m, KK_INT n, const float alpha, const float* x, - KK_INT incx, const float* y, KK_INT incy, float* a, - KK_INT lda) { +void HostBlas::ger(KK_INT m, KK_INT n, const float alpha, const float* x, KK_INT incx, const float* y, + KK_INT incy, float* a, KK_INT lda) { F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, KK_INT n, const float alpha, - const float* x, KK_INT incx, float* a, KK_INT lda) { +void HostBlas::syr(const char uplo, KK_INT n, const float alpha, const float* x, KK_INT incx, float* a, + KK_INT lda) { F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> -void HostBlas::syr2(const char uplo, KK_INT n, const float alpha, - const float* x, KK_INT incx, const float* y, +void HostBlas::syr2(const char uplo, KK_INT n, const float alpha, const float* x, KK_INT incx, const float* y, KK_INT incy, float* a, KK_INT lda) { F77_FUNC_SSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::trsv(const char uplo, const char transa, const char diag, - KK_INT m, const float* a, KK_INT lda, +void HostBlas::trsv(const char uplo, const char transa, const char diag, KK_INT m, const float* a, KK_INT lda, /* */ float* b, KK_INT ldb) { F77_FUNC_STRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, KK_INT m, - KK_INT n, KK_INT k, const float alpha, - const float* a, KK_INT lda, const float* b, - KK_INT ldb, const float beta, +void HostBlas::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, const float alpha, + const float* a, KK_INT lda, const float* b, KK_INT ldb, const float beta, /* */ float* c, KK_INT ldc) { - F77_FUNC_SGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, - c, &ldc); + F77_FUNC_SGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, KK_INT n, - KK_INT k, const float alpha, const float* a, +void HostBlas::herk(const char transa, const char transb, KK_INT n, KK_INT k, const float alpha, const float* a, KK_INT lda, const float beta, /* */ float* c, KK_INT ldc) { F77_FUNC_SSYRK(&transa, &transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> -void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trmm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const float alpha, const float* a, KK_INT lda, /* */ float* b, KK_INT ldb) { - F77_FUNC_STRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_STRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> -void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trsm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const float alpha, const float* a, KK_INT lda, /* */ float* b, KK_INT ldb) { - F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } /// @@ -700,19 +561,17 @@ double HostBlas::asum(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_DASUM(&n, x, &x_inc); } template <> -double HostBlas::dot(KK_INT n, const double* x, KK_INT x_inc, - const double* y, KK_INT y_inc) { +double HostBlas::dot(KK_INT n, const double* x, KK_INT x_inc, const double* y, KK_INT y_inc) { return F77_FUNC_DDOT(&n, x, &x_inc, y, &y_inc); } template <> -void HostBlas::axpy(KK_INT n, const double alpha, const double* x, - KK_INT x_inc, +void HostBlas::axpy(KK_INT n, const double alpha, const double* x, KK_INT x_inc, /* */ double* y, KK_INT y_inc) { F77_FUNC_DAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(KK_INT const N, double* X, KK_INT const incx, - double* Y, KK_INT const incy, double* c, double* s) { +void HostBlas::rot(KK_INT const N, double* X, KK_INT const incx, double* Y, KK_INT const incy, double* c, + double* s) { F77_FUNC_DROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -720,82 +579,67 @@ void HostBlas::rotg(double* a, double* b, double* c, double* s) { F77_FUNC_DROTG(a, b, c, s); } template <> -void HostBlas::rotm(const KK_INT n, double* X, const KK_INT incx, - double* Y, const KK_INT incy, const double* param) { +void HostBlas::rotm(const KK_INT n, double* X, const KK_INT incx, double* Y, const KK_INT incy, + const double* param) { F77_FUNC_DROTM(&n, X, &incx, Y, &incy, param); } template <> -void HostBlas::rotmg(double* d1, double* d2, double* x1, - const double* y1, double* param) { +void HostBlas::rotmg(double* d1, double* d2, double* x1, const double* y1, double* param) { F77_FUNC_DROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(KK_INT const N, double* X, KK_INT const incx, - double* Y, KK_INT const incy) { +void HostBlas::swap(KK_INT const N, double* X, KK_INT const incx, double* Y, KK_INT const incy) { F77_FUNC_DSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, - const double alpha, const double* a, KK_INT lda, +void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, const double alpha, const double* a, KK_INT lda, const double* b, KK_INT ldb, const double beta, /* */ double* c, KK_INT ldc) { F77_FUNC_DGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger(KK_INT m, KK_INT n, const double alpha, - const double* x, KK_INT incx, const double* y, +void HostBlas::ger(KK_INT m, KK_INT n, const double alpha, const double* x, KK_INT incx, const double* y, KK_INT incy, double* a, KK_INT lda) { F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, KK_INT n, const double alpha, - const double* x, KK_INT incx, double* a, +void HostBlas::syr(const char uplo, KK_INT n, const double alpha, const double* x, KK_INT incx, double* a, KK_INT lda) { F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> -void HostBlas::syr2(const char uplo, KK_INT n, const double alpha, - const double* x, KK_INT incx, const double* y, - KK_INT incy, double* a, KK_INT lda) { +void HostBlas::syr2(const char uplo, KK_INT n, const double alpha, const double* x, KK_INT incx, + const double* y, KK_INT incy, double* a, KK_INT lda) { F77_FUNC_DSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::trsv(const char uplo, const char transa, const char diag, - KK_INT m, const double* a, KK_INT lda, +void HostBlas::trsv(const char uplo, const char transa, const char diag, KK_INT m, const double* a, KK_INT lda, /* */ double* b, KK_INT ldb) { F77_FUNC_DTRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, KK_INT m, - KK_INT n, KK_INT k, const double alpha, - const double* a, KK_INT lda, const double* b, - KK_INT ldb, const double beta, +void HostBlas::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, const double alpha, + const double* a, KK_INT lda, const double* b, KK_INT ldb, const double beta, /* */ double* c, KK_INT ldc) { - F77_FUNC_DGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, - c, &ldc); + F77_FUNC_DGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, KK_INT n, - KK_INT k, const double alpha, const double* a, - KK_INT lda, const double beta, +void HostBlas::herk(const char transa, const char transb, KK_INT n, KK_INT k, const double alpha, + const double* a, KK_INT lda, const double beta, /* */ double* c, KK_INT ldc) { F77_FUNC_DSYRK(&transa, &transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> -void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trmm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const double alpha, const double* a, KK_INT lda, /* */ double* b, KK_INT ldb) { - F77_FUNC_DTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_DTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> -void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, +void HostBlas::trsm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, const double alpha, const double* a, KK_INT lda, /* */ double* b, KK_INT ldb) { - F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, - &ldb); + F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } /// @@ -803,34 +647,25 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, /// template <> -void HostBlas >::scal(KK_INT n, - const std::complex alpha, - /* */ std::complex* x, - KK_INT x_inc) { +void HostBlas >::scal(KK_INT n, const std::complex alpha, + /* */ std::complex* x, KK_INT x_inc) { F77_FUNC_CSCAL(&n, &alpha, x, &x_inc); } template <> -KK_INT HostBlas >::iamax(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_ICAMAX(&n, x, &x_inc); } template <> -float HostBlas >::nrm2(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +float HostBlas >::nrm2(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_SCNRM2(&n, x, &x_inc); } template <> -float HostBlas >::asum(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +float HostBlas >::asum(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_SCASUM(&n, x, &x_inc); } template <> -std::complex HostBlas >::dot( - KK_INT n, const std::complex* x, KK_INT x_inc, - const std::complex* y, KK_INT y_inc) { +std::complex HostBlas >::dot(KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) _kk_float2 res = F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); return std::complex(res.vals[0], res.vals[1]); @@ -841,131 +676,99 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(KK_INT n, - const std::complex alpha, - const std::complex* x, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, KK_INT x_inc, - /* */ std::complex* y, - KK_INT y_inc) { + /* */ std::complex* y, KK_INT y_inc) { F77_FUNC_CAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas >::rot(KK_INT const N, std::complex* X, - KK_INT const incx, - std::complex* Y, - KK_INT const incy, float* c, - float* s) { +void HostBlas >::rot(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy, float* c, float* s) { F77_FUNC_CROT(&N, X, &incx, Y, &incy, c, s); } template <> -void HostBlas >::rotg(std::complex* a, - std::complex* b, float* c, +void HostBlas >::rotg(std::complex* a, std::complex* b, float* c, std::complex* s) { F77_FUNC_CROTG(a, b, c, s); } template <> -void HostBlas >::swap(KK_INT const N, - std::complex* X, - KK_INT const incx, - std::complex* Y, - KK_INT const incy) { +void HostBlas >::swap(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy) { F77_FUNC_CSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas >::gemv( - const char trans, KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, const std::complex* b, - KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_CGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemv(const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_CGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, + &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::geru( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, (std::complex*)a, - &lda); +void HostBlas >::geru(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, + (std::complex*)a, &lda); } template <> -void HostBlas >::gerc( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, (std::complex*)a, - &lda); +void HostBlas >::gerc(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, + (std::complex*)a, &lda); } template <> template <> -void HostBlas >::her( - const char uplo, KK_INT n, const float alpha, const std::complex* x, - KK_INT incx, std::complex* a, KK_INT lda) { - F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (std::complex*)a, &lda); +void HostBlas >::her(const char uplo, KK_INT n, const float alpha, + const std::complex* x, KK_INT incx, std::complex* a, + KK_INT lda) { + F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::her2( - const char uplo, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, (std::complex*)a, - &lda); +void HostBlas >::her2(const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, + (std::complex*)a, &lda); } template <> -void HostBlas >::trsv(const char uplo, const char transa, - const char diag, KK_INT m, - const std::complex* a, - KK_INT lda, - /* */ std::complex* b, - KK_INT ldb) { - F77_FUNC_CTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, - (std::complex*)b, &ldb); +void HostBlas >::trsv(const char uplo, const char transa, const char diag, KK_INT m, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_CTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> -void HostBlas >::gemm( - const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex* b, KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_CGEMM(&transa, &transb, &m, &n, &k, &alpha, - (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_CGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::herk( - const char transa, const char transb, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_CHERK(&transa, &transb, &n, &k, &alpha, - (const std::complex*)a, &lda, &beta, - (std::complex*)c, &ldc); +void HostBlas >::herk(const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_CHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, + &ldc); } template <> -void HostBlas >::trmm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_CTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); +void HostBlas >::trmm(const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_CTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); } template <> -void HostBlas >::trsm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_CTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); +void HostBlas >::trsm(const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_CTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); } /// @@ -973,34 +776,25 @@ void HostBlas >::trsm( /// template <> -void HostBlas >::scal(KK_INT n, - const std::complex alpha, - /* */ std::complex* x, - KK_INT x_inc) { +void HostBlas >::scal(KK_INT n, const std::complex alpha, + /* */ std::complex* x, KK_INT x_inc) { F77_FUNC_ZSCAL(&n, &alpha, x, &x_inc); } template <> -KK_INT HostBlas >::iamax(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_IZAMAX(&n, x, &x_inc); } template <> -double HostBlas >::nrm2(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +double HostBlas >::nrm2(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_DZNRM2(&n, x, &x_inc); } template <> -double HostBlas >::asum(KK_INT n, - const std::complex* x, - KK_INT x_inc) { +double HostBlas >::asum(KK_INT n, const std::complex* x, KK_INT x_inc) { return F77_FUNC_DZASUM(&n, x, &x_inc); } template <> -std::complex HostBlas >::dot( - KK_INT n, const std::complex* x, KK_INT x_inc, - const std::complex* y, KK_INT y_inc) { +std::complex HostBlas >::dot(KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) _kk_double2 res = F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); return std::complex(res.vals[0], res.vals[1]); @@ -1011,133 +805,100 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(KK_INT n, - const std::complex alpha, - const std::complex* x, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, KK_INT x_inc, - /* */ std::complex* y, - KK_INT y_inc) { + /* */ std::complex* y, KK_INT y_inc) { F77_FUNC_ZAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas >::rot( - KK_INT const N, std::complex* X, KK_INT const incx, - std::complex* Y, KK_INT const incy, double* c, double* s) { +void HostBlas >::rot(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy, double* c, double* s) { F77_FUNC_ZROT(&N, X, &incx, Y, &incy, c, s); } template <> -void HostBlas >::rotg(std::complex* a, - std::complex* b, double* c, +void HostBlas >::rotg(std::complex* a, std::complex* b, double* c, std::complex* s) { F77_FUNC_ZROTG(a, b, c, s); } template <> -void HostBlas >::swap(KK_INT const N, - std::complex* X, - KK_INT const incx, - std::complex* Y, - KK_INT const incy) { +void HostBlas >::swap(KK_INT const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy) { F77_FUNC_ZSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas >::gemv( - const char trans, KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, const std::complex* b, - KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_ZGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemv(const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_ZGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, + &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::geru( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, +void HostBlas >::geru(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> -void HostBlas >::gerc( - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, +void HostBlas >::gerc(KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> template <> -void HostBlas >::her(const char uplo, KK_INT n, - const double alpha, - const std::complex* x, - KK_INT incx, - std::complex* a, +void HostBlas >::her(const char uplo, KK_INT n, const double alpha, + const std::complex* x, KK_INT incx, std::complex* a, KK_INT lda) { - F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (std::complex*)a, &lda); + F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::her2( - const char uplo, KK_INT n, const std::complex alpha, - const std::complex* x, KK_INT incx, const std::complex* y, - KK_INT incy, std::complex* a, KK_INT lda) { - F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, - (const std::complex*)y, &incy, +void HostBlas >::her2(const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> -void HostBlas >::trsv(const char uplo, const char transa, - const char diag, KK_INT m, - const std::complex* a, - KK_INT lda, - /* */ std::complex* b, - KK_INT ldb) { - F77_FUNC_ZTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, - &lda, (std::complex*)b, &ldb); +void HostBlas >::trsv(const char uplo, const char transa, const char diag, KK_INT m, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_ZTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> -void HostBlas >::gemm( - const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex* b, KK_INT ldb, const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_ZGEMM(&transa, &transb, &m, &n, &k, &alpha, - (const std::complex*)a, &lda, - (const std::complex*)b, &ldb, &beta, - (std::complex*)c, &ldc); +void HostBlas >::gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_ZGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::herk( - const char transa, const char transb, KK_INT n, KK_INT k, - const std::complex alpha, const std::complex* a, KK_INT lda, - const std::complex beta, - /* */ std::complex* c, KK_INT ldc) { - F77_FUNC_ZHERK(&transa, &transb, &n, &k, &alpha, - (const std::complex*)a, &lda, &beta, +void HostBlas >::herk(const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { + F77_FUNC_ZHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::trmm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_ZTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); -} -template <> -void HostBlas >::trsm( - const char side, const char uplo, const char transa, const char diag, - KK_INT m, KK_INT n, const std::complex alpha, - const std::complex* a, KK_INT lda, - /* */ std::complex* b, KK_INT ldb) { - F77_FUNC_ZTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, - (const std::complex*)a, &lda, (std::complex*)b, - &ldb); +void HostBlas >::trmm(const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_ZTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); +} +template <> +void HostBlas >::trsm(const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { + F77_FUNC_ZTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, + (std::complex*)b, &ldb); } } // namespace Impl diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index f7fb3d3978..576fde8471 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -57,66 +57,57 @@ struct HostBlas { static void axpy(KK_INT n, const T alpha, const T *x, KK_INT x_inc, /* */ T *y, KK_INT y_inc); - static void rot(KK_INT const N, T *X, KK_INT const incx, T *Y, - KK_INT const incy, mag_type *c, mag_type *s); + static void rot(KK_INT const N, T *X, KK_INT const incx, T *Y, KK_INT const incy, mag_type *c, mag_type *s); static void rotg(T *a, T *b, mag_type *c, T *s); - static void rotm(const KK_INT n, T *X, const KK_INT incx, T *Y, - const KK_INT incy, T const *param); + static void rotm(const KK_INT n, T *X, const KK_INT incx, T *Y, const KK_INT incy, T const *param); static void rotmg(T *d1, T *d2, T *x1, const T *y1, T *param); - static void swap(KK_INT const N, T *X, KK_INT const incx, T *Y, - KK_INT const incy); + static void swap(KK_INT const N, T *X, KK_INT const incx, T *Y, KK_INT const incy); - static void gemv(const char trans, KK_INT m, KK_INT n, const T alpha, - const T *a, KK_INT lda, const T *b, KK_INT ldb, const T beta, + static void gemv(const char trans, KK_INT m, KK_INT n, const T alpha, const T *a, KK_INT lda, const T *b, KK_INT ldb, + const T beta, /* */ T *c, KK_INT ldc); - static void ger(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, - const T *y, KK_INT incy, T *a, KK_INT lda); + static void ger(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void geru(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, - const T *y, KK_INT incy, T *a, KK_INT lda); + static void geru(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void gerc(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, - const T *y, KK_INT incy, T *a, KK_INT lda); + static void gerc(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void syr(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); + static void syr(const char uplo, KK_INT n, const T alpha, const T *x, KK_INT incx, T *a, KK_INT lda); - static void syr2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void syr2(const char uplo, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); template - static void her(const char uplo, KK_INT n, const tAlpha alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); + static void her(const char uplo, KK_INT n, const tAlpha alpha, const T *x, KK_INT incx, T *a, KK_INT lda); - static void her2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void her2(const char uplo, KK_INT n, const T alpha, const T *x, KK_INT incx, const T *y, KK_INT incy, T *a, + KK_INT lda); - static void trsv(const char uplo, const char transa, const char diag, - KK_INT m, const T *a, KK_INT lda, + static void trsv(const char uplo, const char transa, const char diag, KK_INT m, const T *a, KK_INT lda, /* */ T *b, KK_INT ldb); - static void gemm(const char transa, const char transb, KK_INT m, KK_INT n, - KK_INT k, const T alpha, const T *a, KK_INT lda, const T *b, - KK_INT ldb, const T beta, + static void gemm(const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, const T alpha, const T *a, + KK_INT lda, const T *b, KK_INT ldb, const T beta, /* */ T *c, KK_INT ldc); - static void herk(const char transa, const char transb, KK_INT n, KK_INT k, - const T alpha, const T *a, KK_INT lda, const T beta, + static void herk(const char transa, const char transb, KK_INT n, KK_INT k, const T alpha, const T *a, KK_INT lda, + const T beta, /* */ T *c, KK_INT ldc); - static void trmm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, const T alpha, - const T *a, KK_INT lda, + static void trmm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, + const T alpha, const T *a, KK_INT lda, /* */ T *b, KK_INT ldb); - static void trsm(const char side, const char uplo, const char transa, - const char diag, KK_INT m, KK_INT n, const T alpha, - const T *a, KK_INT lda, + static void trsm(const char side, const char uplo, const char transa, const char diag, KK_INT m, KK_INT n, + const T alpha, const T *a, KK_INT lda, /* */ T *b, KK_INT ldb); }; } // namespace Impl diff --git a/blas/tpls/KokkosBlas_Rocm_tpl.hpp b/blas/tpls/KokkosBlas_Rocm_tpl.hpp index 6f89d349c9..b5a7dabf6f 100644 --- a/blas/tpls/KokkosBlas_Rocm_tpl.hpp +++ b/blas/tpls/KokkosBlas_Rocm_tpl.hpp @@ -25,8 +25,7 @@ namespace Impl { RocBlasSingleton::RocBlasSingleton() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_create_handle(&handle)); - Kokkos::push_finalize_hook( - [&]() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_destroy_handle(handle)); }); + Kokkos::push_finalize_hook([&]() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_destroy_handle(handle)); }); } RocBlasSingleton& RocBlasSingleton::singleton() { diff --git a/blas/tpls/KokkosBlas_tpl_spec.hpp b/blas/tpls/KokkosBlas_tpl_spec.hpp index 0151c0534f..7f40edf435 100644 --- a/blas/tpls/KokkosBlas_tpl_spec.hpp +++ b/blas/tpls/KokkosBlas_tpl_spec.hpp @@ -32,8 +32,7 @@ struct CudaBlasSingleton { static CudaBlasSingleton& singleton(); }; -inline void cublas_internal_error_throw(cublasStatus_t cublasState, - const char* name, const char* file, +inline void cublas_internal_error_throw(cublasStatus_t cublasState, const char* name, const char* file, const int line) { std::ostringstream out; // out << name << " error( " << cublasGetStatusName(cublasState) @@ -43,9 +42,7 @@ inline void cublas_internal_error_throw(cublasStatus_t cublasState, case CUBLAS_STATUS_NOT_INITIALIZED: out << "CUBLAS_STATUS_NOT_INITIALIZED): the library was not initialized."; break; - case CUBLAS_STATUS_ALLOC_FAILED: - out << "CUBLAS_STATUS_ALLOC_FAILED): the resource allocation failed."; - break; + case CUBLAS_STATUS_ALLOC_FAILED: out << "CUBLAS_STATUS_ALLOC_FAILED): the resource allocation failed."; break; case CUBLAS_STATUS_INVALID_VALUE: out << "CUBLAS_STATUS_INVALID_VALUE): an invalid numerical value was " "used as an argument."; @@ -62,9 +59,7 @@ inline void cublas_internal_error_throw(cublasStatus_t cublasState, out << "CUBLAS_STATUS_EXECUTION_FAILED): the GPU program failed to " "execute."; break; - case CUBLAS_STATUS_INTERNAL_ERROR: - out << "CUBLAS_STATUS_INTERNAL_ERROR): an internal operation failed."; - break; + case CUBLAS_STATUS_INTERNAL_ERROR: out << "CUBLAS_STATUS_INTERNAL_ERROR): an internal operation failed."; break; case CUBLAS_STATUS_NOT_SUPPORTED: out << "CUBLAS_STATUS_NOT_SUPPORTED): the feature required is not " "supported."; @@ -77,10 +72,8 @@ inline void cublas_internal_error_throw(cublasStatus_t cublasState, throw std::runtime_error(out.str()); } -inline void cublas_internal_safe_call(cublasStatus_t cublasState, - const char* name, - const char* file = nullptr, - const int line = 0) { +inline void cublas_internal_safe_call(cublasStatus_t cublasState, const char* name, const char* file = nullptr, + const int line = 0) { if (CUBLAS_STATUS_SUCCESS != cublasState) { cublas_internal_error_throw(cublasState, name, file, line); } @@ -89,8 +82,7 @@ inline void cublas_internal_safe_call(cublasStatus_t cublasState, // The macro below defines the interface for the safe cublas calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) \ - KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) /// \brief This function converts KK transpose mode to cuBLAS transpose mode inline cublasOperation_t trans_mode_kk_to_cublas(const char kkMode[]) { @@ -122,8 +114,7 @@ struct RocBlasSingleton { static RocBlasSingleton& singleton(); }; -inline void rocblas_internal_error_throw(rocblas_status rocblasState, - const char* name, const char* file, +inline void rocblas_internal_error_throw(rocblas_status rocblasState, const char* name, const char* file, const int line) { std::ostringstream out; out << name << " error( "; @@ -132,29 +123,19 @@ inline void rocblas_internal_error_throw(rocblas_status rocblasState, out << "rocblas_status_invalid_handle): handle not initialized, invalid " "or null."; break; - case rocblas_status_not_implemented: - out << "rocblas_status_not_implemented): function is not implemented."; - break; - case rocblas_status_invalid_pointer: - out << "rocblas_status_invalid_pointer): invalid pointer argument."; - break; - case rocblas_status_invalid_size: - out << "rocblas_status_invalid_size): invalid size argument."; - break; + case rocblas_status_not_implemented: out << "rocblas_status_not_implemented): function is not implemented."; break; + case rocblas_status_invalid_pointer: out << "rocblas_status_invalid_pointer): invalid pointer argument."; break; + case rocblas_status_invalid_size: out << "rocblas_status_invalid_size): invalid size argument."; break; case rocblas_status_memory_error: out << "rocblas_status_memory_error): failed internal memory allocation, " "copy or dealloc."; break; - case rocblas_status_internal_error: - out << "rocblas_status_internal_error): other internal library failure."; - break; + case rocblas_status_internal_error: out << "rocblas_status_internal_error): other internal library failure."; break; case rocblas_status_perf_degraded: out << "rocblas_status_perf_degraded): performance degraded due to low " "device memory."; break; - case rocblas_status_size_query_mismatch: - out << "unmatched start/stop size query): ."; - break; + case rocblas_status_size_query_mismatch: out << "unmatched start/stop size query): ."; break; case rocblas_status_size_increased: out << "rocblas_status_size_increased): queried device memory size " "increased."; @@ -163,9 +144,7 @@ inline void rocblas_internal_error_throw(rocblas_status rocblasState, out << "rocblas_status_size_unchanged): queried device memory size " "unchanged."; break; - case rocblas_status_invalid_value: - out << "rocblas_status_invalid_value): passed argument not valid."; - break; + case rocblas_status_invalid_value: out << "rocblas_status_invalid_value): passed argument not valid."; break; case rocblas_status_continue: out << "rocblas_status_continue): nothing preventing function to " "proceed."; @@ -182,10 +161,8 @@ inline void rocblas_internal_error_throw(rocblas_status rocblasState, throw std::runtime_error(out.str()); } -inline void rocblas_internal_safe_call(rocblas_status rocblasState, - const char* name, - const char* file = nullptr, - const int line = 0) { +inline void rocblas_internal_safe_call(rocblas_status rocblasState, const char* name, const char* file = nullptr, + const int line = 0) { if (rocblas_status_success != rocblasState) { rocblas_internal_error_throw(rocblasState, name, file, line); } diff --git a/blas/unit_test/Test_Blas1_abs.hpp b/blas/unit_test/Test_Blas1_abs.hpp index 5bf3f55388..eb2d290a6f 100644 --- a/blas/unit_test/Test_Blas1_abs.hpp +++ b/blas/unit_test/Test_Blas1_abs.hpp @@ -32,8 +32,7 @@ void impl_test_abs(int N) { view_stride_adapter y("Y", N); view_stride_adapter org_y("Org_Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -55,8 +54,7 @@ void impl_test_abs(int N) { // Copy result to host (h_y is subview of h_b_y) Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), - eps * AT::abs(x.h_view(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), eps * AT::abs(x.h_view(i))); } // Run with const input // Reset output @@ -64,8 +62,7 @@ void impl_test_abs(int N) { KokkosBlas::abs(y.d_view, x.d_view_const); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), - eps * AT::abs(x.h_view(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), eps * AT::abs(x.h_view(i))); } } @@ -79,8 +76,7 @@ void impl_test_abs_mv(int N, int K) { view_stride_adapter y("Y", N, K); view_stride_adapter org_y("Org_Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -104,8 +100,7 @@ void impl_test_abs_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), - eps * AT::abs(x.h_view(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), eps * AT::abs(x.h_view(i, j))); } } // Test and verify const input @@ -115,8 +110,7 @@ void impl_test_abs_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), - eps * AT::abs(x.h_view(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), eps * AT::abs(x.h_view(i, j))); } } } @@ -125,8 +119,7 @@ void impl_test_abs_mv(int N, int K) { template int test_abs() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_abs(0); @@ -136,8 +129,7 @@ int test_abs() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_abs(0); @@ -146,8 +138,7 @@ int test_abs() { // Test::impl_test_abs(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_abs(0); @@ -156,8 +147,7 @@ int test_abs() { // Test::impl_test_abs(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_abs(1024); Test::impl_test_abs(1024); #endif @@ -168,8 +158,7 @@ int test_abs() { template int test_abs_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_abs_mv(0, 5); @@ -179,8 +168,7 @@ int test_abs_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_abs_mv(0, 5); @@ -189,8 +177,7 @@ int test_abs_mv() { // Test::impl_test_abs_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_abs_mv(0, 5); @@ -199,8 +186,7 @@ int test_abs_mv() { // Test::impl_test_abs_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_abs_mv(1024, 5); Test::impl_test_abs_mv(1024, 5); #endif @@ -209,8 +195,7 @@ int test_abs_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_float"); test_abs(); @@ -224,8 +209,7 @@ TEST_F(TestCategory, abs_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); test_abs(); @@ -239,8 +223,7 @@ TEST_F(TestCategory, abs_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); test_abs, Kokkos::complex, TestDevice>(); @@ -253,9 +236,8 @@ TEST_F(TestCategory, abs_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_int"); test_abs(); diff --git a/blas/unit_test/Test_Blas1_asum.hpp b/blas/unit_test/Test_Blas1_asum.hpp index 65b5b2c063..07cf2e6998 100644 --- a/blas/unit_test/Test_Blas1_asum.hpp +++ b/blas/unit_test/Test_Blas1_asum.hpp @@ -28,8 +28,7 @@ void impl_test_asum(int N) { view_stride_adapter a("A", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -46,8 +45,7 @@ void impl_test_asum(int N) { // parts. // // This is safe; ArithTraits::imag is 0 if T is real. - expected_result += - MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); + expected_result += MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } typename AT::mag_type nonconst_result = KokkosBlas::asum(a.d_view); @@ -62,8 +60,7 @@ void impl_test_asum(int N) { template int test_asum() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_asum(0); Test::impl_test_asum(13); @@ -72,8 +69,7 @@ int test_asum() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_asum(0); Test::impl_test_asum(13); @@ -81,8 +77,7 @@ int test_asum() { // Test::impl_test_asum(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_asum(0); Test::impl_test_asum(13); @@ -94,8 +89,7 @@ int test_asum() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_float"); test_asum(); @@ -104,8 +98,7 @@ TEST_F(TestCategory, asum_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_double"); test_asum(); @@ -114,8 +107,7 @@ TEST_F(TestCategory, asum_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_complex_double"); test_asum, TestDevice>(); @@ -123,9 +115,8 @@ TEST_F(TestCategory, asum_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_int"); test_asum(); diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 299e18e493..16d6bdc78f 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -34,16 +34,14 @@ void impl_test_axpby(int N) { const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_val = 10; const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) + - Kokkos::ArithTraits::abs(b)) * - max_val * eps; + (static_cast(Kokkos::ArithTraits::abs(a)) + Kokkos::ArithTraits::abs(b)) * max_val * + eps; view_stride_adapter x("X", N); view_stride_adapter y("Y", N); view_stride_adapter org_y("Org_Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -58,8 +56,7 @@ void impl_test_axpby(int N) { KokkosBlas::axpby(a, x.d_view, b, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), - y.h_view(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), y.h_view(i), 2 * max_error); } // Re-randomize y @@ -68,8 +65,7 @@ void impl_test_axpby(int N) { KokkosBlas::axpby(a, x.d_view_const, b, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), - y.h_view(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), y.h_view(i), 2 * max_error); } } @@ -88,12 +84,10 @@ void impl_test_axpby_mv(int N, int K) { const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_val = 10; const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) + - Kokkos::ArithTraits::abs(b)) * - max_val * eps; + (static_cast(Kokkos::ArithTraits::abs(a)) + Kokkos::ArithTraits::abs(b)) * max_val * + eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -114,9 +108,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } @@ -126,9 +118,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } } @@ -137,8 +127,7 @@ void impl_test_axpby_mv(int N, int K) { template int test_axpby() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpby(0); @@ -148,8 +137,7 @@ int test_axpby() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpby(0); @@ -158,8 +146,7 @@ int test_axpby() { Test::impl_test_axpby(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpby(0); @@ -168,8 +155,7 @@ int test_axpby() { Test::impl_test_axpby(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpby(1024); Test::impl_test_axpby(1024); #endif @@ -180,8 +166,7 @@ int test_axpby() { template int test_axpby_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpby_mv(0, 5); @@ -191,8 +176,7 @@ int test_axpby_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpby_mv(0, 5); @@ -201,8 +185,7 @@ int test_axpby_mv() { Test::impl_test_axpby_mv(132231, 5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpby_mv(0, 5); @@ -211,8 +194,7 @@ int test_axpby_mv() { Test::impl_test_axpby_mv(132231, 5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpby_mv(1024, 5); Test::impl_test_axpby_mv(1024, 5); #endif @@ -221,8 +203,7 @@ int test_axpby_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_float"); test_axpby(); @@ -236,8 +217,7 @@ TEST_F(TestCategory, axpby_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double"); test_axpby(); @@ -250,8 +230,7 @@ TEST_F(TestCategory, axpby_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_complex_double"); test_axpby, Kokkos::complex, TestDevice>(); @@ -264,9 +243,8 @@ TEST_F(TestCategory, axpby_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_int"); test_axpby(); @@ -279,8 +257,7 @@ TEST_F(TestCategory, axpby_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpby_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double_int"); test_axpby(); diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 6ce7bad0b1..4f9b394c25 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -76,22 +76,16 @@ constexpr bool isRank0() { return false; } -template -void impl_test_axpby_unification_compare( - tA const& a, tX const& x, tB const& b, tY const& y, int N, - bool testWithNanY, - typename Kokkos::ArithTraits::mag_type const max_val, - typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = Kokkos::ArithTraits::zero(), - tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { - using ScalarTypeX = - typename std::remove_const::type; - using ScalarTypeY = - typename std::remove_const::type; - - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); +template +void impl_test_axpby_unification_compare(tA const& a, tX const& x, tB const& b, tY const& y, int N, bool testWithNanY, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + using ScalarTypeX = typename std::remove_const::type; + using ScalarTypeY = typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarTypeX randStart, randEnd; @@ -121,8 +115,7 @@ void impl_test_axpby_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -136,8 +129,7 @@ void impl_test_axpby_unification_compare( KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); @@ -148,8 +140,7 @@ void impl_test_axpby_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -169,8 +160,7 @@ void impl_test_axpby_unification_compare( valueB = b; KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -189,9 +179,8 @@ void impl_test_axpby_unification_compare( if (testWithNanY == false) { for (int i(0); i < N; ++i) { - EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + - valueB * org_y.h_view(i)), - y.h_view(i), 4. * max_error); + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + valueB * org_y.h_view(i)), y.h_view(i), + 4. * max_error); } } else { // ******************************************************** @@ -220,28 +209,22 @@ void impl_test_axpby_unification_compare( } else { EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); } - EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), - y.h_view(i), 4. * max_error); + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), y.h_view(i), 4. * max_error); } } } -template -void impl_test_axpby_mv_unification_compare( - tA const& a, tX const& x, tB const& b, tY const& y, int N, int K, - bool testWithNanY, - typename Kokkos::ArithTraits::mag_type const max_val, - typename Kokkos::ArithTraits::mag_type const max_error, - tScalarA const inputValueA = Kokkos::ArithTraits::zero(), - tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { - using ScalarTypeX = - typename std::remove_const::type; - using ScalarTypeY = - typename std::remove_const::type; - - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); +template +void impl_test_axpby_mv_unification_compare(tA const& a, tX const& x, tB const& b, tY const& y, int N, int K, + bool testWithNanY, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + using ScalarTypeX = typename std::remove_const::type; + using ScalarTypeY = typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarTypeX randStart, randEnd; @@ -284,8 +267,7 @@ void impl_test_axpby_mv_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -298,8 +280,7 @@ void impl_test_axpby_mv_unification_compare( KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); } } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueA = inputValueA; } else { typename tA::HostMirror h_a("h_A"); @@ -310,8 +291,7 @@ void impl_test_axpby_mv_unification_compare( valueB = b; KokkosBlas::axpby(a, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -329,8 +309,7 @@ void impl_test_axpby_mv_unification_compare( valueB = b; KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); } else if constexpr (isRank0()) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { valueB = inputValueB; } else { typename tB::HostMirror h_b("h_B"); @@ -371,22 +350,18 @@ void impl_test_axpby_mv_unification_compare( << std::endl; #endif vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k) + - b.h_view(b_k) * org_y.h_view(i, k)); + static_cast(a.h_view(a_k) * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); } else { int a_k(a.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast( - a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } } else { if constexpr (bIsRank1) { (void)valueB; // Avoid "set but not used" error int b_k(b.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = static_cast( - valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + vanillaValue = static_cast(valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); } else { - vanillaValue = static_cast( - valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + vanillaValue = static_cast(valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); } } #if 0 @@ -411,8 +386,7 @@ void impl_test_axpby_mv_unification_compare( if constexpr (aIsRank1) { (void)valueA; // Avoid "set but not used" error int a_k(a.h_view.extent(0) == 1 ? 0 : k); - vanillaValue = - static_cast(a.h_view(a_k) * x.h_view(i, k)); + vanillaValue = static_cast(a.h_view(a_k) * x.h_view(i, k)); #if 0 ScalarTypeY tmp = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); std::cout << "i = " << i @@ -468,9 +442,8 @@ void impl_test_axpby_mv_unification_compare( } } -template +template void impl_test_axpby_unification(int const N) { using ViewTypeAr0 = Kokkos::View; using ViewTypeAr1s_1 = Kokkos::View; @@ -484,10 +457,8 @@ void impl_test_axpby_unification(int const N) { using ViewTypeY = Kokkos::View; - std::array const valuesA{ - -1, Kokkos::ArithTraits::zero(), 1, 3}; - std::array const valuesB{ - -1, Kokkos::ArithTraits::zero(), 1, 5}; + std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. @@ -495,9 +466,8 @@ void impl_test_axpby_unification(int const N) { MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); MagnitudeB const max_val = 10; MagnitudeB const max_error = - static_cast( - Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + - Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + static_cast(Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * max_val * eps; // ************************************************************ @@ -518,15 +488,13 @@ void impl_test_axpby_unification(int const N) { a = valueA; b = valueB; - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, true, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -556,14 +524,12 @@ void impl_test_axpby_unification(int const N) { a = valueA; Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, true, max_val, max_error); } } @@ -589,16 +555,13 @@ void impl_test_axpby_unification(int const N) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -622,15 +585,13 @@ void impl_test_axpby_unification(int const N) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -657,15 +618,13 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); b = valueB; - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, true, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + tScalarB, view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); } } } @@ -678,8 +637,7 @@ void impl_test_axpby_unification(int const N) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Starting case 06/16" << std::endl; #endif - if constexpr ((std::is_same_v) || - (std::is_same_v)) { + if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors } else { for (size_t i(0); i < valuesA.size(); ++i) { @@ -694,14 +652,12 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( - a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, true, max_val, max_error); } } @@ -730,17 +686,13 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -768,16 +720,13 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); } } } @@ -802,17 +751,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -839,17 +786,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -875,17 +820,14 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -909,17 +851,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -943,17 +883,14 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, view_stride_adapter, + tScalarB, tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -980,17 +917,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, false, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + impl_test_axpby_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -1015,18 +950,15 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, view_stride_adapter, + tScalarB, view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } @@ -1050,26 +982,22 @@ void impl_test_axpby_unification(int const N) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, false, max_val, max_error); + impl_test_axpby_unification_compare, view_stride_adapter, + tScalarB, view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, true, max_val, + max_error); } } } } } -template +template void impl_test_axpby_mv_unification(int const N, int const K) { // std::cout << "=========================================" << std::endl; // std::cout << "Entering impl_test_axpby_mv_unification()" @@ -1094,10 +1022,8 @@ void impl_test_axpby_mv_unification(int const N, int const K) { using ViewTypeY = Kokkos::View; - std::array const valuesA{ - -1, Kokkos::ArithTraits::zero(), 1, 3}; - std::array const valuesB{ - -1, Kokkos::ArithTraits::zero(), 1, 5}; + std::array const valuesA{-1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{-1, Kokkos::ArithTraits::zero(), 1, 5}; // eps should probably be based on tScalarB since that is the type // in which the result is computed. @@ -1105,9 +1031,8 @@ void impl_test_axpby_mv_unification(int const N, int const K) { MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); MagnitudeB const max_val = 10; MagnitudeB const max_error = - static_cast( - Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + - Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + static_cast(Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * max_val * eps; // ************************************************************ @@ -1128,15 +1053,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( - a, x, b, y, N, K, true, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1163,14 +1086,12 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, true, max_val, max_error); } } @@ -1196,16 +1117,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1239,10 +1157,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1265,15 +1182,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { a = valueA; Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1307,10 +1222,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, tScalarA, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1336,14 +1250,12 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + tScalarB, view_stride_adapter, Device>( a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - tScalarB, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + tScalarB, view_stride_adapter, Device>( a, x, b, y, N, K, true, max_val, max_error); } } @@ -1357,8 +1269,7 @@ void impl_test_axpby_mv_unification(int const N, int const K) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Starting case 08/36" << std::endl; #endif - if constexpr ((std::is_same_v) || - (std::is_same_v)) { + if constexpr ((std::is_same_v) || (std::is_same_v)) { // Avoid the test, due to compilation errors } else { for (size_t i(0); i < valuesA.size(); ++i) { @@ -1373,14 +1284,12 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - ViewTypeBr0, view_stride_adapter, Device>( + impl_test_axpby_mv_unification_compare, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( a, x, b, y, N, K, true, max_val, max_error); } } @@ -1409,17 +1318,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1457,11 +1362,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1488,16 +1391,13 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a, valueA); Kokkos::deep_copy(b.d_base, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); } } } @@ -1535,10 +1435,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, b.h_base); } - impl_test_axpby_mv_unification_compare< - tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + impl_test_axpby_mv_unification_compare, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); } } } @@ -1562,17 +1461,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1599,17 +1496,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1635,17 +1530,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1680,10 +1572,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -1707,16 +1598,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1751,10 +1640,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -1787,17 +1675,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1834,17 +1720,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1880,17 +1764,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -1936,10 +1817,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -1973,16 +1853,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2029,10 +1907,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -2055,17 +1932,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2092,17 +1967,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2128,17 +2001,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2173,10 +2043,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -2200,16 +2069,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, valueA); Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2244,10 +2111,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(b.d_base, b.h_base); } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -2280,17 +2146,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } b = valueB; - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, tScalarB, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2327,17 +2191,15 @@ void impl_test_axpby_mv_unification(int const N, int const K) { Kokkos::deep_copy(a.d_base, a.h_base); } Kokkos::deep_copy(b, valueB); - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, false, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { - impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, ViewTypeBr0, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + impl_test_axpby_mv_unification_compare, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2373,17 +2235,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, - view_stride_adapter, Device>(a, x, b, y, N, K, true, - max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2430,10 +2289,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); } } } @@ -2467,16 +2325,14 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } Kokkos::deep_copy(b.d_base, valueB); impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); if (valueB == Kokkos::ArithTraits::zero()) { impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, true, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); } } } @@ -2523,10 +2379,9 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } impl_test_axpby_mv_unification_compare< - tScalarA, view_stride_adapter, - view_stride_adapter, tScalarB, - view_stride_adapter, view_stride_adapter, - Device>(a, x, b, y, N, K, false, max_val, max_error); + tScalarA, view_stride_adapter, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, Device>(a, x, b, y, N, K, false, max_val, + max_error); } } } @@ -2537,130 +2392,103 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } // namespace Test -template +template int test_axpby_unification() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LLL" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); + Test::impl_test_axpby_unification(14); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-RRR" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>(14); + Test::impl_test_axpby_unification(14); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SSS" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); + Test::impl_test_axpby_unification(14); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SLL" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); + Test::impl_test_axpby_unification(14); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LSS" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); + Test::impl_test_axpby_unification(14); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-SRS" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>(14); + Test::impl_test_axpby_unification(14); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Calling impl_test_axpby_unif(), L-LSR" << std::endl; #endif - Test::impl_test_axpby_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>(14); + Test::impl_test_axpby_unification(14); #endif return 1; } -template +template int test_axpby_mv_unification() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( - 14, numVecsAxpbyTest); + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification(14, numVecsAxpbyTest); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>( - 14, numVecsAxpbyTest); -#endif - -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( - 14, numVecsAxpbyTest); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( - 14, numVecsAxpbyTest); - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( - 14, numVecsAxpbyTest); - - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, - Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>( - 14, numVecsAxpbyTest); - - Test::impl_test_axpby_mv_unification< - tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, - Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>( - 14, numVecsAxpbyTest); + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_axpby_mv_unification(14, numVecsAxpbyTest); + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification(14, + numVecsAxpbyTest); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_float"); test_axpby_unification(); @@ -2674,44 +2502,36 @@ TEST_F(TestCategory, axpby_mv_unification_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double"); test_axpby_unification(); } TEST_F(TestCategory, axpby_mv_unification_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_mv_unification_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_double"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_unification_complex_double"); - test_axpby_unification, Kokkos::complex, - Kokkos::complex, Kokkos::complex, - TestDevice>(); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_complex_double"); + test_axpby_unification, Kokkos::complex, Kokkos::complex, + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_unification_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_mv_unification_complex_double"); - test_axpby_mv_unification, Kokkos::complex, - Kokkos::complex, Kokkos::complex, - TestDevice>(); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_complex_double"); + test_axpby_mv_unification, Kokkos::complex, Kokkos::complex, + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_unification_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_int"); test_axpby_unification(); @@ -2724,17 +2544,14 @@ TEST_F(TestCategory, axpby_mv_unification_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpby_unification_double_int) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_unification_double_int"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double_int"); test_axpby_unification(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_double_mv_unification_int) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::axpby_mv_unification_double_int"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_double_int"); test_axpby_mv_unification(); Kokkos::Profiling::popRegion(); } diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 76528f4a52..94e4260268 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -31,16 +31,13 @@ void impl_test_axpy(int N) { const MagnitudeB max_val = 10; const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + - max_val) * - eps; + (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + max_val) * eps; view_stride_adapter x("X", N); view_stride_adapter y("Y", N); view_stride_adapter org_y("Org_Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -88,12 +85,9 @@ void impl_test_axpy_mv(int N, int K) { const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_val = 10; const MagnitudeB max_error = - (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + - max_val) * - eps; + (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + max_val) * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -113,9 +107,7 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } @@ -125,9 +117,7 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), - y.h_view(i, j), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), y.h_view(i, j), 2 * max_error); } } } @@ -136,8 +126,7 @@ void impl_test_axpy_mv(int N, int K) { template int test_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpy(0); @@ -147,8 +136,7 @@ int test_axpy() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpy(0); @@ -157,8 +145,7 @@ int test_axpy() { // Test::impl_test_axpy(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpy(0); @@ -167,8 +154,7 @@ int test_axpy() { // Test::impl_test_axpy(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpy(1024); Test::impl_test_axpy(1024); #endif @@ -179,8 +165,7 @@ int test_axpy() { template int test_axpy_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_axpy_mv(0, 5); @@ -190,8 +175,7 @@ int test_axpy_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_axpy_mv(0, 5); @@ -200,8 +184,7 @@ int test_axpy_mv() { // Test::impl_test_axpy_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_axpy_mv(0, 5); @@ -210,8 +193,7 @@ int test_axpy_mv() { // Test::impl_test_axpy_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_axpy_mv(1024, 5); Test::impl_test_axpy_mv(1024, 5); #endif @@ -220,8 +202,7 @@ int test_axpy_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_float"); test_axpy(); @@ -235,8 +216,7 @@ TEST_F(TestCategory, axpy_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double"); test_axpy(); @@ -250,8 +230,7 @@ TEST_F(TestCategory, axpy_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_complex_double"); test_axpy, Kokkos::complex, TestDevice>(); @@ -264,9 +243,8 @@ TEST_F(TestCategory, axpy_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_int"); test_axpy(); @@ -279,8 +257,7 @@ TEST_F(TestCategory, axpy_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpy_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double_int"); test_axpy(); diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index 911925476a..3de0fae12d 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -30,8 +30,7 @@ void impl_test_dot(int N) { view_stride_adapter a("a", N); view_stride_adapter b("b", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -48,13 +47,11 @@ void impl_test_dot(int N) { Kokkos::deep_copy(b.h_base, b.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) - expected_result += ats::conj(a.h_view(i)) * b.h_view(i); + for (int i = 0; i < N; i++) expected_result += ats::conj(a.h_view(i)) * b.h_view(i); ScalarA nonconst_nonconst_result = KokkosBlas::dot(a.d_view, b.d_view); - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); ScalarA const_const_result = KokkosBlas::dot(a.d_view_const, b.d_view_const); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); @@ -75,8 +72,7 @@ void impl_test_dot_mv(int N, int K) { view_stride_adapter a("A", N, K); view_stride_adapter b("B", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -95,8 +91,7 @@ void impl_test_dot_mv(int N, int K) { ScalarA* expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) - expected_result[j] += ats::conj(a.h_view(i, j)) * b.h_view(i, j); + for (int i = 0; i < N; i++) expected_result[j] += ats::conj(a.h_view(i, j)) * b.h_view(i, j); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -107,32 +102,28 @@ void impl_test_dot_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::dot(r, a.d_view_const, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); - EXPECT_NEAR_KK(const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_const_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::dot(r, a.d_view, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); - EXPECT_NEAR_KK(non_const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(non_const_const_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::dot(r, a.d_view_const, b.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -142,8 +133,7 @@ void impl_test_dot_mv(int N, int K) { template int test_dot() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_dot(0); @@ -153,8 +143,7 @@ int test_dot() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_dot(0); @@ -163,8 +152,7 @@ int test_dot() { // Test::impl_test_dot(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_dot(0); @@ -173,8 +161,7 @@ int test_dot() { // Test::impl_test_dot(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_dot(1024); Test::impl_test_dot(1024); #endif @@ -185,8 +172,7 @@ int test_dot() { template int test_dot_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_dot_mv(0, 5); @@ -197,8 +183,7 @@ int test_dot_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_dot_mv(0, 5); @@ -210,8 +195,7 @@ int test_dot_mv() { // Removing the layout stride test as ViewTypeA a("a", N); // is invalid since the view constructor needs a stride object! -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_dot_mv(0, 5); @@ -221,8 +205,7 @@ int test_dot_mv() { // Test::impl_test_dot_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_dot_mv(1024, 5); Test::impl_test_dot_mv(1024, 5); #endif @@ -231,8 +214,7 @@ int test_dot_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_float"); test_dot(); @@ -246,8 +228,7 @@ TEST_F(TestCategory, dot_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_double"); test_dot(); @@ -261,8 +242,7 @@ TEST_F(TestCategory, dot_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_complex_double"); test_dot, Kokkos::complex, TestDevice>(); @@ -275,9 +255,8 @@ TEST_F(TestCategory, dot_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_int"); test_dot(); diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index 49f759958a..94ff8b3ebe 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -29,8 +29,7 @@ void impl_test_iamax(int N) { view_stride_adapter a("X", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -66,11 +65,8 @@ void impl_test_iamax(int N) { { // printf("impl_test_iamax -- return result as a 0-D View on host -- N // %d\n", N); - typedef Kokkos::View - ViewType0D; - ViewType0D r("Iamax::Result 0-D View on host", - typename ViewTypeA::array_layout()); + typedef Kokkos::View ViewType0D; + ViewType0D r("Iamax::Result 0-D View on host", typename ViewTypeA::array_layout()); KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); @@ -85,10 +81,8 @@ void impl_test_iamax(int N) { { // printf("impl_test_iamax -- return result as a 0-D View on device -- N // %d\n", N); - typedef Kokkos::View - ViewType0D; - ViewType0D r("Iamax::Result 0-D View on device", - typename ViewTypeA::array_layout()); + typedef Kokkos::View ViewType0D; + ViewType0D r("Iamax::Result 0-D View on device", typename ViewTypeA::array_layout()); typename ViewType0D::HostMirror h_r = Kokkos::create_mirror_view(r); size_type nonconst_max_loc, const_max_loc; @@ -118,8 +112,7 @@ void impl_test_iamax_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -148,11 +141,8 @@ void impl_test_iamax_mv(int N, int K) { { // printf("impl_test_iamax_mv -- return results as a 1-D View on host -- N // %d\n", N); - Kokkos::View rcontig( - "Iamax::Result View on host", K); - Kokkos::View - r = rcontig; + Kokkos::View rcontig("Iamax::Result View on host", K); + Kokkos::View r = rcontig; KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); @@ -177,10 +167,8 @@ void impl_test_iamax_mv(int N, int K) { // printf("impl_test_iamax_mv -- return results as a 1-D View on device -- N // %d\n", N); Kokkos::View rcontig("Iamax::Result View on host", K); - Kokkos::View r = - rcontig; - typename Kokkos::View::HostMirror h_r = + Kokkos::View r = rcontig; + typename Kokkos::View::HostMirror h_r = Kokkos::create_mirror_view(rcontig); KokkosBlas::iamax(r, a.d_view); @@ -210,8 +198,7 @@ void impl_test_iamax_mv(int N, int K) { template int test_iamax() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_iamax(0); Test::impl_test_iamax(13); @@ -220,8 +207,7 @@ int test_iamax() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_iamax(0); Test::impl_test_iamax(13); @@ -229,8 +215,7 @@ int test_iamax() { // Test::impl_test_iamax(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_iamax(0); Test::impl_test_iamax(13); @@ -244,8 +229,7 @@ int test_iamax() { template int test_iamax_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_iamax_mv(0, 5); Test::impl_test_iamax_mv(13, 5); @@ -254,8 +238,7 @@ int test_iamax_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_iamax_mv(0, 5); Test::impl_test_iamax_mv(13, 5); @@ -263,8 +246,7 @@ int test_iamax_mv() { // Test::impl_test_iamax_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_iamax_mv(0, 5); Test::impl_test_iamax_mv(13, 5); @@ -276,8 +258,7 @@ int test_iamax_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_float"); test_iamax(); @@ -291,8 +272,7 @@ TEST_F(TestCategory, iamax_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_double"); test_iamax(); @@ -306,8 +286,7 @@ TEST_F(TestCategory, iamax_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_complex_double"); test_iamax, TestDevice>(); @@ -320,9 +299,8 @@ TEST_F(TestCategory, iamax_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_int"); test_iamax(); diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index 6555280f0d..f5755982e7 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -36,8 +36,7 @@ void impl_test_mult(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -63,27 +62,21 @@ void impl_test_mult(int N) { KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + - b * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + b * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view_const); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + - b * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + b * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); KokkosBlas::mult(b, z.d_view, a, x.d_view_const, y.d_view_const); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + - b * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + b * org_z.h_view(i)), z.h_view(i), eps); } } @@ -99,8 +92,7 @@ void impl_test_mult_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -131,9 +123,8 @@ void impl_test_mult_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + - b * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + b * org_z.h_view(i, j)), z.h_view(i, j), + eps); } } @@ -142,9 +133,8 @@ void impl_test_mult_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + - b * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + b * org_z.h_view(i, j)), z.h_view(i, j), + eps); } } } @@ -153,58 +143,43 @@ void impl_test_mult_mv(int N, int K) { template int test_mult() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_mult( - 0); - Test::impl_test_mult( - 13); - Test::impl_test_mult( - 1024); + Test::impl_test_mult(0); + Test::impl_test_mult(13); + Test::impl_test_mult(1024); // Test::impl_test_mult(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_mult( - 0); - Test::impl_test_mult( - 13); - Test::impl_test_mult( - 1024); + Test::impl_test_mult(0); + Test::impl_test_mult(13); + Test::impl_test_mult(1024); // Test::impl_test_mult(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult( - 0); - Test::impl_test_mult( - 13); - Test::impl_test_mult( - 1024); + Test::impl_test_mult(0); + Test::impl_test_mult(13); + Test::impl_test_mult(1024); // Test::impl_test_mult(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult( - 1024); - Test::impl_test_mult( - 1024); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult(1024); + Test::impl_test_mult(1024); #endif return 1; @@ -213,66 +188,50 @@ int test_mult() { template int test_mult_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_mult_mv(0, 5); - Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); // Test::impl_test_mult_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_mult_mv(0, 5); - Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); // Test::impl_test_mult_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult_mv(0, 5); - Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); // Test::impl_test_mult_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult_mv(1024, 5); - Test::impl_test_mult_mv(1024, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(1024, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); test_mult(); @@ -286,8 +245,7 @@ TEST_F(TestCategory, mult_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double"); test_mult(); @@ -301,25 +259,21 @@ TEST_F(TestCategory, mult_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_complex_double"); - test_mult, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_mult, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_complex_double"); - test_mult_mv, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_mult_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_int"); test_mult(); @@ -332,8 +286,7 @@ TEST_F(TestCategory, mult_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, mult_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double_int"); test_mult(); diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index 24795878d1..3942dafe93 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -29,8 +29,7 @@ void impl_test_nrm1(int N) { view_stride_adapter a("a", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -38,10 +37,7 @@ void impl_test_nrm1(int N) { Kokkos::deep_copy(a.h_base, a.d_base); - double eps = (std::is_same::mag_type, - float>::value - ? 1e-4 - : 1e-7); + double eps = (std::is_same::mag_type, float>::value ? 1e-4 : 1e-7); mag_type expected_result = 0; for (int i = 0; i < N; i++) { @@ -50,8 +46,7 @@ void impl_test_nrm1(int N) { // parts. See netlib, MKL, and CUBLAS documentation. // // This is safe; ArithTraits::imag is 0 if T is real. - expected_result += - MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); + expected_result += MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } mag_type nonconst_result = KokkosBlas::nrm1(a.d_view); @@ -70,8 +65,7 @@ void impl_test_nrm1_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -79,18 +73,13 @@ void impl_test_nrm1_mv(int N, int K) { Kokkos::deep_copy(a.h_base, a.d_base); - double eps = (std::is_same::mag_type, - float>::value - ? 1e-4 - : 1e-7); + double eps = (std::is_same::mag_type, float>::value ? 1e-4 : 1e-7); - Kokkos::View expected_result("Expected Nrm1", - K); + Kokkos::View expected_result("Expected Nrm1", K); for (int k = 0; k < K; k++) { expected_result(k) = MAT::zero(); for (int i = 0; i < N; i++) { - expected_result(k) += MAT::abs(AT::real(a.h_view(i, k))) + - MAT::abs(AT::imag(a.h_view(i, k))); + expected_result(k) += MAT::abs(AT::real(a.h_view(i, k))) + MAT::abs(AT::imag(a.h_view(i, k))); } } @@ -109,8 +98,7 @@ void impl_test_nrm1_mv(int N, int K) { template int test_nrm1() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm1(0); Test::impl_test_nrm1(13); @@ -119,8 +107,7 @@ int test_nrm1() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm1(0); Test::impl_test_nrm1(13); @@ -128,8 +115,7 @@ int test_nrm1() { Test::impl_test_nrm1(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm1(0); Test::impl_test_nrm1(13); @@ -143,8 +129,7 @@ int test_nrm1() { template int test_nrm1_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); @@ -154,8 +139,7 @@ int test_nrm1_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); @@ -164,8 +148,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(132231, 5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); @@ -178,8 +161,7 @@ int test_nrm1_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_float"); test_nrm1(); @@ -193,8 +175,7 @@ TEST_F(TestCategory, nrm1_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_double"); test_nrm1(); @@ -208,8 +189,7 @@ TEST_F(TestCategory, nrm1_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_complex_double"); test_nrm1, TestDevice>(); @@ -222,9 +202,8 @@ TEST_F(TestCategory, nrm1_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_int"); test_nrm1(); diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index a9b3f7c10f..556d48f753 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -27,8 +27,7 @@ void impl_test_nrm2(int N) { view_stride_adapter a("a", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -42,8 +41,7 @@ void impl_test_nrm2(int N) { for (int i = 0; i < N; i++) { expected_result += AT::abs(a.h_view(i)) * AT::abs(a.h_view(i)); } - expected_result = - Kokkos::ArithTraits::sqrt(expected_result); + expected_result = Kokkos::ArithTraits::sqrt(expected_result); typename AT::mag_type nonconst_result = KokkosBlas::nrm2(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); @@ -59,8 +57,7 @@ void impl_test_nrm2_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -74,8 +71,7 @@ void impl_test_nrm2_mv(int N, int K) { for (int i = 0; i < N; i++) { expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); } - expected_result[j] = - Kokkos::ArithTraits::sqrt(expected_result[j]); + expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -86,8 +82,7 @@ void impl_test_nrm2_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::nrm2(r, a.d_view_const); @@ -104,8 +99,7 @@ void impl_test_nrm2_mv(int N, int K) { template int test_nrm2() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2(0); Test::impl_test_nrm2(13); @@ -114,8 +108,7 @@ int test_nrm2() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2(0); Test::impl_test_nrm2(13); @@ -123,8 +116,7 @@ int test_nrm2() { // Test::impl_test_nrm2(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2(0); Test::impl_test_nrm2(13); @@ -138,8 +130,7 @@ int test_nrm2() { template int test_nrm2_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); @@ -149,8 +140,7 @@ int test_nrm2_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); @@ -159,8 +149,7 @@ int test_nrm2_mv() { // Test::impl_test_nrm2_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); @@ -173,8 +162,7 @@ int test_nrm2_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_float"); test_nrm2(); @@ -188,8 +176,7 @@ TEST_F(TestCategory, nrm2_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_double"); test_nrm2(); @@ -203,8 +190,7 @@ TEST_F(TestCategory, nrm2_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_complex_double"); test_nrm2, TestDevice>(); @@ -217,9 +203,8 @@ TEST_F(TestCategory, nrm2_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_int"); test_nrm2(); diff --git a/blas/unit_test/Test_Blas1_nrm2_squared.hpp b/blas/unit_test/Test_Blas1_nrm2_squared.hpp index 09e4b3d45d..d718626f8e 100644 --- a/blas/unit_test/Test_Blas1_nrm2_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2_squared.hpp @@ -27,8 +27,7 @@ void impl_test_nrm2_squared(int N) { view_stride_adapter a("a", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -57,8 +56,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); @@ -84,10 +82,8 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -95,10 +91,8 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -109,8 +103,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { template int test_nrm2_squared() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2_squared(0); Test::impl_test_nrm2_squared(13); @@ -119,8 +112,7 @@ int test_nrm2_squared() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2_squared(0); Test::impl_test_nrm2_squared(13); @@ -128,8 +120,7 @@ int test_nrm2_squared() { // Test::impl_test_nrm2_squared(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared(0); Test::impl_test_nrm2_squared(13); @@ -143,8 +134,7 @@ int test_nrm2_squared() { template int test_nrm2_squared_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); @@ -154,8 +144,7 @@ int test_nrm2_squared_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); @@ -164,8 +153,7 @@ int test_nrm2_squared_mv() { // Test::impl_test_nrm2_squared_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); @@ -178,8 +166,7 @@ int test_nrm2_squared_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_float"); test_nrm2_squared(); @@ -193,8 +180,7 @@ TEST_F(TestCategory, nrm2_squared_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_double"); test_nrm2_squared(); @@ -208,25 +194,21 @@ TEST_F(TestCategory, nrm2_squared_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::nrm2_squared_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_complex_double"); test_nrm2_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::nrm2_squared_mv_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_complex_double"); test_nrm2_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_int"); test_nrm2_squared(); diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index 48d8676fe4..6dcc01bf17 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -31,11 +31,9 @@ void impl_test_nrm2w(int N) { constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); - const MagnitudeA max_error = - max_val * std::sqrt(static_cast(N)) * eps; + const MagnitudeA max_error = max_val * std::sqrt(static_cast(N)) * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -51,8 +49,7 @@ void impl_test_nrm2w(int N) { typename AT::mag_type term = AT::abs(a.h_view(i)) / AT::abs(w.h_view(i)); expected_result += term * term; } - expected_result = - Kokkos::ArithTraits::sqrt(expected_result); + expected_result = Kokkos::ArithTraits::sqrt(expected_result); typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a.d_view, w.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); @@ -69,11 +66,9 @@ void impl_test_nrm2w_mv(int N, int K) { constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); - const MagnitudeA max_error = - max_val * std::sqrt(static_cast(N)) * eps; + const MagnitudeA max_error = max_val * std::sqrt(static_cast(N)) * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -88,12 +83,10 @@ void impl_test_nrm2w_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = - AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); + typename AT::mag_type term = AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } - expected_result[j] = - Kokkos::ArithTraits::sqrt(expected_result[j]); + expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } Kokkos::View r("Dot::Result", K); @@ -112,8 +105,7 @@ void impl_test_nrm2w_mv(int N, int K) { template int test_nrm2w() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w(0); Test::impl_test_nrm2w(13); @@ -122,8 +114,7 @@ int test_nrm2w() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w(0); Test::impl_test_nrm2w(13); @@ -131,8 +122,7 @@ int test_nrm2w() { // Test::impl_test_nrm2(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w(0); Test::impl_test_nrm2w(13); @@ -146,8 +136,7 @@ int test_nrm2w() { template int test_nrm2w_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w_mv(0, 5); Test::impl_test_nrm2w_mv(13, 5); @@ -157,8 +146,7 @@ int test_nrm2w_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w_mv(0, 5); Test::impl_test_nrm2w_mv(13, 5); @@ -167,8 +155,7 @@ int test_nrm2w_mv() { // Test::impl_test_nrm2w_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w_mv(0, 5); Test::impl_test_nrm2w_mv(13, 5); @@ -181,8 +168,7 @@ int test_nrm2w_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_float"); test_nrm2w(); @@ -196,8 +182,7 @@ TEST_F(TestCategory, nrm2w_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_double"); test_nrm2w(); @@ -211,8 +196,7 @@ TEST_F(TestCategory, nrm2w_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_complex_double"); test_nrm2w, TestDevice>(); @@ -225,9 +209,8 @@ TEST_F(TestCategory, nrm2w_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_int"); test_nrm2w(); diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 5a55d15fad..42bcdb0848 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -33,8 +33,7 @@ void impl_test_nrm2w_squared(int N) { const MagnitudeA eps = AT::epsilon(); const MagnitudeA max_error = max_val * max_val * N * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -51,8 +50,7 @@ void impl_test_nrm2w_squared(int N) { expected_result += term * term; } - typename AT::mag_type nonconst_result = - KokkosBlas::nrm2w_squared(a.d_view, w.d_view); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a.d_view, w.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } @@ -69,8 +67,7 @@ void impl_test_nrm2w_squared_mv(int N, int K) { const MagnitudeA eps = AT::epsilon(); const MagnitudeA max_error = max_val * max_val * N * eps; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); @@ -84,8 +81,7 @@ void impl_test_nrm2w_squared_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = - AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); + typename AT::mag_type term = AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } } @@ -106,8 +102,7 @@ void impl_test_nrm2w_squared_mv(int N, int K) { template int test_nrm2w_squared() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w_squared(0); Test::impl_test_nrm2w_squared(13); @@ -116,8 +111,7 @@ int test_nrm2w_squared() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w_squared(0); Test::impl_test_nrm2w_squared(13); @@ -125,8 +119,7 @@ int test_nrm2w_squared() { // Test::impl_test_nrm2(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w_squared(0); Test::impl_test_nrm2w_squared(13); @@ -140,8 +133,7 @@ int test_nrm2w_squared() { template int test_nrm2w_squared_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrm2w_squared_mv(0, 5); Test::impl_test_nrm2w_squared_mv(13, 5); @@ -151,8 +143,7 @@ int test_nrm2w_squared_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrm2w_squared_mv(0, 5); Test::impl_test_nrm2w_squared_mv(13, 5); @@ -161,8 +152,7 @@ int test_nrm2w_squared_mv() { // Test::impl_test_nrm2w_squared_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2w_squared_mv(0, 5); Test::impl_test_nrm2w_squared_mv(13, 5); @@ -175,8 +165,7 @@ int test_nrm2w_squared_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_float"); test_nrm2w_squared(); @@ -190,8 +179,7 @@ TEST_F(TestCategory, nrm2w_squared_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_double"); test_nrm2w_squared(); @@ -205,25 +193,21 @@ TEST_F(TestCategory, nrm2w_squared_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::nrm2w_squared_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_complex_double"); test_nrm2w_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::nrm2w_squared_mv_complex_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_complex_double"); test_nrm2w_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_int"); test_nrm2w_squared(); diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 91cc1c7502..e4a9101e85 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -27,8 +27,7 @@ void impl_test_nrminf(int N) { view_stride_adapter a("A", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -38,11 +37,9 @@ void impl_test_nrminf(int N) { double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - typename AT::mag_type expected_result = - Kokkos::ArithTraits::min(); + typename AT::mag_type expected_result = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) - if (AT::abs(a.h_view(i)) > expected_result) - expected_result = AT::abs(a.h_view(i)); + if (AT::abs(a.h_view(i)) > expected_result) expected_result = AT::abs(a.h_view(i)); if (N == 0) expected_result = typename AT::mag_type(0); @@ -60,8 +57,7 @@ void impl_test_nrminf_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -73,8 +69,7 @@ void impl_test_nrminf_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { - if (AT::abs(a.h_view(i, j)) > expected_result[j]) - expected_result[j] = AT::abs(a.h_view(i, j)); + if (AT::abs(a.h_view(i, j)) > expected_result[j]) expected_result[j] = AT::abs(a.h_view(i, j)); } if (N == 0) expected_result[j] = typename AT::mag_type(0); } @@ -103,8 +98,7 @@ void impl_test_nrminf_mv(int N, int K) { template int test_nrminf() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrminf(0); Test::impl_test_nrminf(13); @@ -113,8 +107,7 @@ int test_nrminf() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrminf(0); Test::impl_test_nrminf(13); @@ -122,8 +115,7 @@ int test_nrminf() { // Test::impl_test_nrminf(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrminf(0); Test::impl_test_nrminf(13); @@ -137,8 +129,7 @@ int test_nrminf() { template int test_nrminf_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_nrminf_mv(0, 5); Test::impl_test_nrminf_mv(13, 5); @@ -147,8 +138,7 @@ int test_nrminf_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_nrminf_mv(0, 5); Test::impl_test_nrminf_mv(13, 5); @@ -156,8 +146,7 @@ int test_nrminf_mv() { // Test::impl_test_nrminf_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrminf_mv(0, 5); Test::impl_test_nrminf_mv(13, 5); @@ -169,8 +158,7 @@ int test_nrminf_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_float"); test_nrminf(); @@ -184,8 +172,7 @@ TEST_F(TestCategory, nrminf_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_double"); test_nrminf(); @@ -199,8 +186,7 @@ TEST_F(TestCategory, nrminf_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_complex_double"); test_nrminf, TestDevice>(); @@ -213,9 +199,8 @@ TEST_F(TestCategory, nrminf_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_int"); test_nrminf(); diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index c293fa04eb..2b8a07a552 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -36,8 +36,7 @@ void impl_test_reciprocal(int N) { view_stride_adapter x("X", N); view_stride_adapter y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -71,14 +70,12 @@ void impl_test_reciprocal_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; Test::getRandomBounds(10, randStart, randEnd); - Kokkos::fill_random(x.d_view, rand_pool, - Kokkos::ArithTraits::one(), randEnd); + Kokkos::fill_random(x.d_view, rand_pool, Kokkos::ArithTraits::one(), randEnd); } Kokkos::deep_copy(x.h_base, x.d_base); @@ -88,10 +85,8 @@ void impl_test_reciprocal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int j = 0; j < K; ++j) { for (int i = 0; i < N; ++i) { - EXPECT_NEAR_KK( - y.h_view(i, j), - Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), - 2 * Kokkos::ArithTraits::epsilon()); + EXPECT_NEAR_KK(y.h_view(i, j), Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); } } @@ -102,10 +97,8 @@ void impl_test_reciprocal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int j = 0; j < K; j++) { for (int i = 0; i < N; ++i) { - EXPECT_NEAR_KK( - y.h_view(i, j), - Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), - 2 * Kokkos::ArithTraits::epsilon()); + EXPECT_NEAR_KK(y.h_view(i, j), Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); } } } @@ -114,8 +107,7 @@ void impl_test_reciprocal_mv(int N, int K) { template int test_reciprocal() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_reciprocal(0); @@ -125,8 +117,7 @@ int test_reciprocal() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_reciprocal(0); @@ -135,8 +126,7 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_reciprocal(0); @@ -145,8 +135,7 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_reciprocal(1024); Test::impl_test_reciprocal(1024); #endif @@ -157,57 +146,47 @@ int test_reciprocal() { template int test_reciprocal_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); - Test::impl_test_reciprocal_mv(1024, - 5); + Test::impl_test_reciprocal_mv(1024, 5); // Test::impl_test_reciprocal_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); - Test::impl_test_reciprocal_mv(1024, - 5); + Test::impl_test_reciprocal_mv(1024, 5); // Test::impl_test_reciprocal_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); - Test::impl_test_reciprocal_mv(1024, - 5); + Test::impl_test_reciprocal_mv(1024, 5); // Test::impl_test_reciprocal_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_reciprocal_mv(1024, - 5); - Test::impl_test_reciprocal_mv(1024, - 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal_mv(1024, 5); + Test::impl_test_reciprocal_mv(1024, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_float"); test_reciprocal(); @@ -221,8 +200,7 @@ TEST_F(TestCategory, reciprocal_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_double"); test_reciprocal(); @@ -236,26 +214,21 @@ TEST_F(TestCategory, reciprocal_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_complex_double"); - test_reciprocal, Kokkos::complex, - TestDevice>(); + test_reciprocal, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_complex_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::reciprocal_mv_complex_double"); - test_reciprocal_mv, Kokkos::complex, - TestDevice>(); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_complex_double"); + test_reciprocal_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_int"); test_reciprocal(); diff --git a/blas/unit_test/Test_Blas1_rot.hpp b/blas/unit_test/Test_Blas1_rot.hpp index ab1f395923..db9367cb42 100644 --- a/blas/unit_test/Test_Blas1_rot.hpp +++ b/blas/unit_test/Test_Blas1_rot.hpp @@ -71,8 +71,7 @@ int test_rot() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot(); @@ -81,8 +80,7 @@ TEST_F(TestCategory, rot_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot(); @@ -91,8 +89,7 @@ TEST_F(TestCategory, rot_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot, TestDevice>(); @@ -101,8 +98,7 @@ TEST_F(TestCategory, rot_complex_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); test_rot, TestDevice>(); diff --git a/blas/unit_test/Test_Blas1_rotg.hpp b/blas/unit_test/Test_Blas1_rotg.hpp index 27f9c3cf71..31945ba6d9 100644 --- a/blas/unit_test/Test_Blas1_rotg.hpp +++ b/blas/unit_test/Test_Blas1_rotg.hpp @@ -17,8 +17,7 @@ namespace Test { template -void test_rotg_impl(typename Device::execution_space const& space, - Scalar const a_in, Scalar const b_in) { +void test_rotg_impl(typename Device::execution_space const& space, Scalar const a_in, Scalar const b_in) { using magnitude_type = typename Kokkos::ArithTraits::mag_type; using SViewType = Kokkos::View; using MViewType = Kokkos::View; @@ -59,8 +58,7 @@ int test_rotg() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg(); @@ -69,8 +67,7 @@ TEST_F(TestCategory, rotg_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg(); @@ -79,8 +76,7 @@ TEST_F(TestCategory, rotg_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg, TestDevice>(); @@ -89,8 +85,7 @@ TEST_F(TestCategory, rotg_complex_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); test_rotg, TestDevice>(); diff --git a/blas/unit_test/Test_Blas1_rotm.hpp b/blas/unit_test/Test_Blas1_rotm.hpp index 1f41fd06bc..e1a7cddb3c 100644 --- a/blas/unit_test/Test_Blas1_rotm.hpp +++ b/blas/unit_test/Test_Blas1_rotm.hpp @@ -18,8 +18,7 @@ namespace Test { template -void set_rotm_inputs(const int &test_case, vector_view_type &X, - vector_view_type &Y, param_view_type ¶m, +void set_rotm_inputs(const int &test_case, vector_view_type &X, vector_view_type &Y, param_view_type ¶m, vector_ref_type &Xref, vector_ref_type &Yref) { // Initialize X and Y inputs typename vector_view_type::HostMirror X_h = Kokkos::create_mirror_view(X); @@ -37,8 +36,7 @@ void set_rotm_inputs(const int &test_case, vector_view_type &X, Kokkos::deep_copy(Y, Y_h); // Initialize Xref, Yref and param (test case dependent) - typename param_view_type::HostMirror param_h = - Kokkos::create_mirror_view(param); + typename param_view_type::HostMirror param_h = Kokkos::create_mirror_view(param); switch (test_case) { case 0: param_h(0) = -2.0; @@ -116,8 +114,7 @@ void set_rotm_inputs(const int &test_case, vector_view_type &X, } template -void check_results(vector_view_type &X, vector_view_type &Y, - vector_ref_type &Xref, vector_ref_type &Yref) { +void check_results(vector_view_type &X, vector_view_type &Y, vector_ref_type &Xref, vector_ref_type &Yref) { using Scalar = typename vector_view_type::value_type; typename vector_view_type::HostMirror X_h = Kokkos::create_mirror_view(X); @@ -162,8 +159,7 @@ int test_rotm() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); test_rotm(); @@ -172,8 +168,7 @@ TEST_F(TestCategory, rotm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); test_rotm(); diff --git a/blas/unit_test/Test_Blas1_rotmg.hpp b/blas/unit_test/Test_Blas1_rotmg.hpp index ecfc3b6815..0fb3c5f67e 100644 --- a/blas/unit_test/Test_Blas1_rotmg.hpp +++ b/blas/unit_test/Test_Blas1_rotmg.hpp @@ -17,8 +17,7 @@ namespace Test { template -void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, - RView& ref_vals) { +void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, RView& ref_vals) { using scalar_type = typename View0::non_const_value_type; using YView = typename View0::const_type; @@ -28,10 +27,8 @@ void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, const scalar_type eps = Kokkos::ArithTraits::eps(); const scalar_type tol = -#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_MKL) - 100 * - eps; // Guessing MKL implements sin/cos differently so need larger tol +#if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + 100 * eps; // Guessing MKL implements sin/cos differently so need larger tol #else 10 * eps; #endif @@ -61,8 +58,7 @@ void test_rotmg_impl(View0& d1, View0& d2, View0& x1, View0& y1, PView& param, } template -void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, - View0& x1, View0& y1, PView& param, +void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, View0& x1, View0& y1, PView& param, RView& ref_vals) { constexpr double gamma = 4096; Kokkos::deep_copy(param, 0.0); @@ -211,9 +207,7 @@ void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, ref_vals(7) = -0.25; ref_vals(8) = 0.0; break; - default: - throw std::runtime_error("rotmg test: test case unrecognized!"); - break; + default: throw std::runtime_error("rotmg test: test case unrecognized!"); break; } } } // namespace Test @@ -222,8 +216,7 @@ template int test_rotmg() { Kokkos::View d1("d1"), d2("d2"), x1("x1"), y1("y1"); Kokkos::View param("param"); - Kokkos::View ref_vals( - "reference values"); + Kokkos::View ref_vals("reference values"); constexpr int num_test_cases = 9; for (int test_case = 0; test_case < num_test_cases; ++test_case) { diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index a88ed646f1..b0169095fd 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -33,8 +33,7 @@ void impl_test_scal(int N) { view_stride_adapter x("X", N); view_stride_adapter y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -68,8 +67,7 @@ void impl_test_scal_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -89,8 +87,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), eps); } } @@ -100,8 +97,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), eps); } } @@ -113,16 +109,14 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(param_j, ScalarA(3 + j)); } - auto h_params = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), params); + auto h_params = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), params); Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); KokkosBlas::scal(y.d_view, params, x.d_view); Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), - y.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), y.h_view(i, j), eps); } } @@ -131,8 +125,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), - y.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), y.h_view(i, j), eps); } } } @@ -141,8 +134,7 @@ void impl_test_scal_mv(int N, int K) { template int test_scal() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_scal(0); @@ -152,8 +144,7 @@ int test_scal() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_scal(0); @@ -162,8 +153,7 @@ int test_scal() { // Test::impl_test_scal(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_scal(0); @@ -172,8 +162,7 @@ int test_scal() { // Test::impl_test_scal(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_scal(1024); Test::impl_test_scal(1024); #endif @@ -184,8 +173,7 @@ int test_scal() { template int test_scal_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_scal_mv(0, 5); @@ -195,8 +183,7 @@ int test_scal_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_scal_mv(0, 5); @@ -205,8 +192,7 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_scal_mv(0, 5); @@ -215,8 +201,7 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_scal_mv(1024, 5); Test::impl_test_scal_mv(1024, 5); #endif @@ -225,8 +210,7 @@ int test_scal_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_float"); test_scal(); @@ -240,8 +224,7 @@ TEST_F(TestCategory, scal_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double"); test_scal(); @@ -255,8 +238,7 @@ TEST_F(TestCategory, scal_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_complex_double"); test_scal, Kokkos::complex, TestDevice>(); @@ -269,9 +251,8 @@ TEST_F(TestCategory, scal_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_int"); test_scal(); @@ -284,8 +265,7 @@ TEST_F(TestCategory, scal_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, scal_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double_int"); test_scal(); diff --git a/blas/unit_test/Test_Blas1_serial_setscal.hpp b/blas/unit_test/Test_Blas1_serial_setscal.hpp index cfbe4d602d..31ad998ac4 100644 --- a/blas/unit_test/Test_Blas1_serial_setscal.hpp +++ b/blas/unit_test/Test_Blas1_serial_setscal.hpp @@ -34,15 +34,13 @@ enum : int { BlasSet = 0, BlasScale = 1 }; struct KokkosKernelTag {}; struct NaiveTag {}; -template +template struct Functor_TestBlasSerialMatUtil { ScalarType _alpha; ViewType _a; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} + Functor_TestBlasSerialMatUtil(const ScalarType alpha, const ViewType &a) : _alpha(alpha), _a(a) {} KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const int i) const { @@ -76,27 +74,20 @@ struct Functor_TestBlasSerialMatUtil { typedef typename ViewType::value_type value_type; std::string name_region("KokkosBlas::Test::SerialMatUtil"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BlasSet ? "Set" - : TestID == BlasScale ? "Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = (TestID == BlasSet ? "Set" : TestID == BlasScale ? "Scale" : "UnknownTest"); + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy - policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return 0; } }; -template +template void impl_test_blas_matutil(const int N, const int BlkSize) { /// typedefs typedef typename ViewType::value_type value_type; @@ -107,8 +98,7 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { ViewType a("a", N, BlkSize, BlkSize); ViewType b("b", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); @@ -116,12 +106,8 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b, a); /// test body - Functor_TestBlasSerialMatUtil(alpha, a) - .run(); - Functor_TestBlasSerialMatUtil(alpha, b) - .run(); + Functor_TestBlasSerialMatUtil(alpha, a).run(); + Functor_TestBlasSerialMatUtil(alpha, b).run(); Kokkos::fence(); @@ -133,44 +119,31 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b_host, b); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); + for (int j = 0; j < BlkSize; ++j) EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); } } // namespace Test -template +template int test_blas_matutil() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::impl_test_blas_matutil(0, - 10); - Test::impl_test_blas_matutil(10, - 15); - Test::impl_test_blas_matutil(1024, - 9); - Test::impl_test_blas_matutil( - 132231, 3); + typedef Kokkos::View ViewType; + Test::impl_test_blas_matutil(0, 10); + Test::impl_test_blas_matutil(10, 15); + Test::impl_test_blas_matutil(1024, 9); + Test::impl_test_blas_matutil(132231, 3); } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::impl_test_blas_matutil(0, - 10); - Test::impl_test_blas_matutil(10, - 15); - Test::impl_test_blas_matutil(1024, - 9); - Test::impl_test_blas_matutil( - 132231, 3); + typedef Kokkos::View ViewType; + Test::impl_test_blas_matutil(0, 10); + Test::impl_test_blas_matutil(10, 15); + Test::impl_test_blas_matutil(1024, 9); + Test::impl_test_blas_matutil(132231, 3); } #endif @@ -201,19 +174,15 @@ TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_dcomplex) { - test_blas_matutil, - Kokkos::complex, ::Test::BlasSet>(); + test_blas_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_dcomplex) { - test_blas_matutil, - Kokkos::complex, ::Test::BlasScale>(); + test_blas_matutil, Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_double) { - test_blas_matutil, double, - ::Test::BlasSet>(); + test_blas_matutil, double, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_double) { - test_blas_matutil, double, - ::Test::BlasScale>(); + test_blas_matutil, double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index 34d52a7e4a..6d7ae3818e 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -26,8 +26,7 @@ void impl_test_sum(int N) { view_stride_adapter a("A", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -53,8 +52,7 @@ void impl_test_sum_mv(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); @@ -76,8 +74,7 @@ void impl_test_sum_mv(int N, int K) { Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], eps * expected_result[k]); } KokkosBlas::sum(r, a.d_view_const); @@ -94,8 +91,7 @@ void impl_test_sum_mv(int N, int K) { template int test_sum() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_sum(0); Test::impl_test_sum(13); @@ -104,8 +100,7 @@ int test_sum() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_sum(0); Test::impl_test_sum(13); @@ -113,8 +108,7 @@ int test_sum() { // Test::impl_test_sum(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_sum(0); Test::impl_test_sum(13); @@ -128,8 +122,7 @@ int test_sum() { template int test_sum_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); @@ -139,8 +132,7 @@ int test_sum_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); @@ -149,8 +141,7 @@ int test_sum_mv() { // Test::impl_test_sum_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); @@ -163,8 +154,7 @@ int test_sum_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_float"); test_sum(); @@ -178,8 +168,7 @@ TEST_F(TestCategory, sum_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_double"); test_sum(); @@ -193,8 +182,7 @@ TEST_F(TestCategory, sum_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_complex_double"); test_sum, TestDevice>(); @@ -207,9 +195,8 @@ TEST_F(TestCategory, sum_mv_complex_double) { } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_int"); test_sum(); diff --git a/blas/unit_test/Test_Blas1_swap.hpp b/blas/unit_test/Test_Blas1_swap.hpp index 624552f1dc..15a04c652c 100644 --- a/blas/unit_test/Test_Blas1_swap.hpp +++ b/blas/unit_test/Test_Blas1_swap.hpp @@ -55,8 +55,7 @@ int test_swap() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_float"); test_swap(); @@ -65,8 +64,7 @@ TEST_F(TestCategory, swap_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_double"); test_swap(); @@ -75,8 +73,7 @@ TEST_F(TestCategory, swap_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_float"); test_swap, TestDevice>(); @@ -85,8 +82,7 @@ TEST_F(TestCategory, swap_complex_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_double"); test_swap, TestDevice>(); diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index eca7657b55..0f78731ab3 100644 --- a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -47,8 +47,7 @@ void impl_test_team_abs(int N) { view_stride_adapter x("X", N); view_stride_adapter y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); @@ -56,52 +55,36 @@ void impl_test_team_abs(int N) { Kokkos::deep_copy(x.h_base, x.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) - expected_result += AT::abs(x.h_view(i)) * AT::abs(x.h_view(i)); + for (int i = 0; i < N; i++) expected_result += AT::abs(x.h_view(i)) * AT::abs(x.h_view(i)); // KokkosBlas::abs(y,x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); // Zero out y and run again with const input Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); @@ -134,8 +117,7 @@ void impl_test_team_abs_mv(int N, int K) { ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(x.h_view(i, j)) * AT::abs(x.h_view(i, j)); + for (int i = 0; i < N; i++) expected_result[j] += AT::abs(x.h_view(i, j)) * AT::abs(x.h_view(i, j)); } // double eps = std::is_same::value?2*1e-5:1e-7; @@ -147,21 +129,17 @@ void impl_test_team_abs_mv(int N, int K) { // KokkosBlas::abs(y,x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA nonconst_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_result - expected_result[k]) / divisor; + ScalarA nonconst_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); // EXPECT_NEAR_KK( nonconst_result, expected_result[k], // eps*expected_result[k]); @@ -172,21 +150,17 @@ void impl_test_team_abs_mv(int N, int K) { // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAbs", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA const_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_result - expected_result[k]) / divisor; + ScalarA const_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); // EXPECT_NEAR_KK( const_result, expected_result[k], // eps*expected_result[k]); @@ -199,8 +173,7 @@ void impl_test_team_abs_mv(int N, int K) { template int test_team_abs() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_abs(0); @@ -210,8 +183,7 @@ int test_team_abs() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_abs(0); @@ -220,8 +192,7 @@ int test_team_abs() { // Test::impl_test_team_abs(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_abs(0); @@ -230,8 +201,7 @@ int test_team_abs() { // Test::impl_test_team_abs(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_abs(124); Test::impl_test_team_abs(124); #endif @@ -242,8 +212,7 @@ int test_team_abs() { template int test_team_abs_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_abs_mv(0, 5); @@ -254,8 +223,7 @@ int test_team_abs_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_abs_mv(0, 5); @@ -265,8 +233,7 @@ int test_team_abs_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_abs_mv(0, 5); @@ -276,8 +243,7 @@ int test_team_abs_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_abs_mv(124, 5); Test::impl_test_team_abs_mv(124, 5); #endif @@ -286,46 +252,31 @@ int test_team_abs_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_abs_float) { - test_team_abs(); -} -TEST_F(TestCategory, team_abs_mv_float) { - test_team_abs_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_abs_float) { test_team_abs(); } +TEST_F(TestCategory, team_abs_mv_float) { test_team_abs_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_abs_double) { - test_team_abs(); -} -TEST_F(TestCategory, team_abs_mv_double) { - test_team_abs_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_abs_double) { test_team_abs(); } +TEST_F(TestCategory, team_abs_mv_double) { test_team_abs_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_complex_double) { test_team_abs, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_abs_mv_complex_double) { - test_team_abs_mv, Kokkos::complex, - TestDevice>(); + test_team_abs_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_int) { test_team_abs(); } -TEST_F(TestCategory, team_abs_mv_int) { - test_team_abs_mv(); -} +TEST_F(TestCategory, team_abs_mv_int) { test_team_abs_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index 5875f2bc1f..cadb2d0d09 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -60,57 +60,40 @@ void impl_test_team_axpby(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarB(a * x.h_view(i) + b * y.h_view(i)) * - ScalarB(a * x.h_view(i) + b * y.h_view(i)); + expected_result += ScalarB(a * x.h_view(i) + b * y.h_view(i)) * ScalarB(a * x.h_view(i) + b * y.h_view(i)); // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( teamMember, a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = - KokkosBlas::dot(y.d_view_const, y.d_view_const); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -146,8 +129,8 @@ void impl_test_team_axpby_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)) * - ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)); + expected_result[j] += + ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)) * ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -158,40 +141,32 @@ void impl_test_team_axpby_mv(int N, int K) { // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::axpby( - teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(AT::abs(nonconst_nonconst_result), - AT::abs(expected_result[k]), - AT::abs(expected_result[k] * eps)); + EXPECT_NEAR_KK(AT::abs(nonconst_nonconst_result), AT::abs(expected_result[k]), AT::abs(expected_result[k] * eps)); } Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpby", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::axpby( - teamMember, a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(AT::abs(const_non_const_result), AT::abs(expected_result[k]), - AT::abs(eps * expected_result[k])); + EXPECT_NEAR_KK(AT::abs(const_non_const_result), AT::abs(expected_result[k]), AT::abs(eps * expected_result[k])); } delete[] expected_result; @@ -201,8 +176,7 @@ void impl_test_team_axpby_mv(int N, int K) { template int test_team_axpby() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpby(0); @@ -212,8 +186,7 @@ int test_team_axpby() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpby(0); @@ -222,8 +195,7 @@ int test_team_axpby() { // Test::impl_test_team_axpby(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpby(0); @@ -232,8 +204,7 @@ int test_team_axpby() { // Test::impl_test_team_axpby(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpby(124); Test::impl_test_team_axpby(124); #endif @@ -244,8 +215,7 @@ int test_team_axpby() { template int test_team_axpby_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpby_mv(0, 5); @@ -256,8 +226,7 @@ int test_team_axpby_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpby_mv(0, 5); @@ -267,8 +236,7 @@ int test_team_axpby_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpby_mv(0, 5); @@ -278,8 +246,7 @@ int test_team_axpby_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpby_mv(124, 5); Test::impl_test_team_axpby_mv(124, 5); #endif @@ -288,59 +255,36 @@ int test_team_axpby_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpby_float) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_mv_float) { - test_team_axpby_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpby_float) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_mv_float) { test_team_axpby_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpby_double) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_mv_double) { - test_team_axpby_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpby_double) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_mv_double) { test_team_axpby_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_complex_double) { - test_team_axpby, Kokkos::complex, - TestDevice>(); + test_team_axpby, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_axpby_mv_complex_double) { - test_team_axpby_mv, Kokkos::complex, - TestDevice>(); + test_team_axpby_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpby_int) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_mv_int) { - test_team_axpby_mv(); -} +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpby_int) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_mv_int) { test_team_axpby_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_axpby_double_int) { - test_team_axpby(); -} -TEST_F(TestCategory, team_axpby_double_mv_int) { - test_team_axpby_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_axpby_double_int) { test_team_axpby(); } +TEST_F(TestCategory, team_axpby_double_mv_int) { test_team_axpby_mv(); } #endif #endif // check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_axpy.hpp b/blas/unit_test/Test_Blas1_team_axpy.hpp index a5ac6a9c66..de2bf78855 100644 --- a/blas/unit_test/Test_Blas1_team_axpy.hpp +++ b/blas/unit_test/Test_Blas1_team_axpy.hpp @@ -48,8 +48,7 @@ void impl_test_team_axpy(int N) { ScalarA a = 3; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -60,55 +59,38 @@ void impl_test_team_axpy(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarB(a * x.h_view(i) + y.h_view(i)) * - ScalarB(a * x.h_view(i) + y.h_view(i)); + expected_result += ScalarB(a * x.h_view(i) + y.h_view(i)) * ScalarB(a * x.h_view(i) + y.h_view(i)); // KokkosBlas::axpy(a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( teamMember, a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = - KokkosBlas::dot(y.d_view_const, y.d_view_const); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -143,8 +125,7 @@ void impl_test_team_axpy_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ScalarB(a * x.h_view(i, j) + y.h_view(i, j)) * - ScalarB(a * x.h_view(i, j) + y.h_view(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j) + y.h_view(i, j)) * ScalarB(a * x.h_view(i, j) + y.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -153,39 +134,32 @@ void impl_test_team_axpy_mv(int N, int K) { // KokkosBlas::axpy(a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::axpy( - teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamAxpy", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::axpy( - teamMember, a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -195,8 +169,7 @@ void impl_test_team_axpy_mv(int N, int K) { template int test_team_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpy(0); @@ -206,8 +179,7 @@ int test_team_axpy() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpy(0); @@ -216,8 +188,7 @@ int test_team_axpy() { // Test::impl_test_team_axpy(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpy(0); @@ -226,8 +197,7 @@ int test_team_axpy() { // Test::impl_test_team_axpy(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpy(124); Test::impl_test_team_axpy(124); #endif @@ -238,8 +208,7 @@ int test_team_axpy() { template int test_team_axpy_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_axpy_mv(0, 5); @@ -250,8 +219,7 @@ int test_team_axpy_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_axpy_mv(0, 5); @@ -261,8 +229,7 @@ int test_team_axpy_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_axpy_mv(0, 5); @@ -272,8 +239,7 @@ int test_team_axpy_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_axpy_mv(124, 5); Test::impl_test_team_axpy_mv(124, 5); #endif @@ -282,57 +248,36 @@ int test_team_axpy_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpy_float) { - test_team_axpy(); -} -TEST_F(TestCategory, team_axpy_mv_float) { - test_team_axpy_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpy_float) { test_team_axpy(); } +TEST_F(TestCategory, team_axpy_mv_float) { test_team_axpy_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpy_double) { - test_team_axpy(); -} -TEST_F(TestCategory, team_axpy_mv_double) { - test_team_axpy_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_axpy_double) { test_team_axpy(); } +TEST_F(TestCategory, team_axpy_mv_double) { test_team_axpy_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_complex_double) { - test_team_axpy, Kokkos::complex, - TestDevice>(); + test_team_axpy, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_axpy_mv_complex_double) { - test_team_axpy_mv, Kokkos::complex, - TestDevice>(); + test_team_axpy_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_int) { test_team_axpy(); } -TEST_F(TestCategory, team_axpy_mv_int) { - test_team_axpy_mv(); -} +TEST_F(TestCategory, team_axpy_mv_int) { test_team_axpy_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_axpy_double_int) { - test_team_axpy(); -} -TEST_F(TestCategory, team_axpy_double_mv_int) { - test_team_axpy_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_axpy_double_int) { test_team_axpy(); } +TEST_F(TestCategory, team_axpy_double_mv_int) { test_team_axpy_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_dot.hpp b/blas/unit_test/Test_Blas1_team_dot.hpp index 26baf261fe..9445d5784d 100644 --- a/blas/unit_test/Test_Blas1_team_dot.hpp +++ b/blas/unit_test/Test_Blas1_team_dot.hpp @@ -61,47 +61,32 @@ void impl_test_team_dot(int N) { ScalarA nonconst_nonconst_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_nonconst_result += r(k); double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); ScalarA const_const_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_const_result += r(k); @@ -112,21 +97,14 @@ void impl_test_team_dot(int N) { ScalarA nonconst_const_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_const_result += r(k); @@ -137,21 +115,14 @@ void impl_test_team_dot(int N) { ScalarA const_nonconst_result = 0; Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, - Kokkos::subview( - a.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - b.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(a.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(b.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_nonconst_result += r(k); @@ -185,8 +156,7 @@ void impl_test_team_dot_mv(int N, int K) { ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) - expected_result[j] += a.h_view(i, j) * b.h_view(i, j); + for (int i = 0; i < N; i++) expected_result[j] += a.h_view(i, j) * b.h_view(i, j); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -196,66 +166,54 @@ void impl_test_team_dot_mv(int N, int K) { // KokkosBlas::dot(r,a,b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::dot(r,c_a,c_b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); - EXPECT_NEAR_KK(const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_const_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::dot(r,a,c_b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); - EXPECT_NEAR_KK(non_const_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(non_const_const_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::dot(r,c_a,b); Kokkos::parallel_for( - "KokkosBlas::Test::TeamDot", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), - Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -265,8 +223,7 @@ void impl_test_team_dot_mv(int N, int K) { template int test_team_dot() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_dot(0); @@ -276,8 +233,7 @@ int test_team_dot() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_dot(0); @@ -286,8 +242,7 @@ int test_team_dot() { // Test::impl_test_team_dot(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_dot(0); @@ -296,8 +251,7 @@ int test_team_dot() { // Test::impl_test_team_dot(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_dot(124); Test::impl_test_team_dot(124); #endif @@ -308,8 +262,7 @@ int test_team_dot() { template int test_team_dot_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_dot_mv(0, 5); @@ -320,8 +273,7 @@ int test_team_dot_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_dot_mv(0, 5); @@ -331,8 +283,7 @@ int test_team_dot_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_dot_mv(0, 5); @@ -342,8 +293,7 @@ int test_team_dot_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_dot_mv(124, 5); Test::impl_test_team_dot_mv(124, 5); #endif @@ -352,46 +302,31 @@ int test_team_dot_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_dot_float) { - test_team_dot(); -} -TEST_F(TestCategory, team_dot_mv_float) { - test_team_dot_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_dot_float) { test_team_dot(); } +TEST_F(TestCategory, team_dot_mv_float) { test_team_dot_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_dot_double) { - test_team_dot(); -} -TEST_F(TestCategory, team_dot_mv_double) { - test_team_dot_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_dot_double) { test_team_dot(); } +TEST_F(TestCategory, team_dot_mv_double) { test_team_dot_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_complex_double) { test_team_dot, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_dot_mv_complex_double) { - test_team_dot_mv, Kokkos::complex, - TestDevice>(); + test_team_dot_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_int) { test_team_dot(); } -TEST_F(TestCategory, team_dot_mv_int) { - test_team_dot_mv(); -} +TEST_F(TestCategory, team_dot_mv_int) { test_team_dot_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index 488e9ccf51..63fdbf99c1 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -65,63 +65,41 @@ void impl_test_team_mult(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)) * - ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)); + expected_result += ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)) * + ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)); // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( teamMember, b, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); // Reset z on device to orig and run again with const-valued y Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( teamMember, b, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); @@ -130,27 +108,17 @@ void impl_test_team_mult(int N) { Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,c_x,c_y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( teamMember, b, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), - Kokkos::subview( - y.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); @@ -195,19 +163,16 @@ void impl_test_team_mult_mv(int N, int K) { // Since b and a are known and the largest value in z, x and y // is set by the variables max_val, the error upper bound will be // max_error = a * max_val * max_val - typename Kokkos::ArithTraits::mag_type const eps = - Kokkos::ArithTraits::epsilon(); + typename Kokkos::ArithTraits::mag_type const eps = Kokkos::ArithTraits::epsilon(); typename Kokkos::ArithTraits::mag_type const max_error = Kokkos::ArithTraits::abs(a) * max_val * max_val * eps; // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, - x.d_view, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, x.d_view, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(z.h_base, z.d_base); @@ -224,12 +189,10 @@ void impl_test_team_mult_mv(int N, int K) { Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( - "KokkosBlas::Test::TeamMult", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, - x.d_view, Kokkos::subview(y.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, x.d_view, + Kokkos::subview(y.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(z.h_base, z.d_base); @@ -245,58 +208,43 @@ void impl_test_team_mult_mv(int N, int K) { template int test_team_mult() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_mult(0); - Test::impl_test_team_mult(13); - Test::impl_test_team_mult(124); + Test::impl_test_team_mult(0); + Test::impl_test_team_mult(13); + Test::impl_test_team_mult(124); // Test::impl_test_team_mult(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_mult(0); - Test::impl_test_team_mult(13); - Test::impl_test_team_mult(124); + Test::impl_test_team_mult(0); + Test::impl_test_team_mult(13); + Test::impl_test_team_mult(124); // Test::impl_test_team_mult(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_mult(0); - Test::impl_test_team_mult(13); - Test::impl_test_team_mult(124); + Test::impl_test_team_mult(0); + Test::impl_test_team_mult(13); + Test::impl_test_team_mult(124); // Test::impl_test_team_mult(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_mult(124); - Test::impl_test_team_mult(124); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_mult(124); + Test::impl_test_team_mult(124); #endif return 1; @@ -305,117 +253,79 @@ int test_team_mult() { template int test_team_mult_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_mult_mv(0, 5); - Test::impl_test_team_mult_mv(13, 5); - Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(0, 5); + Test::impl_test_team_mult_mv(13, 5); + Test::impl_test_team_mult_mv(124, 5); // Test::impl_test_team_mult_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_mult_mv(0, 5); - Test::impl_test_team_mult_mv(13, 5); - Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(0, 5); + Test::impl_test_team_mult_mv(13, 5); + Test::impl_test_team_mult_mv(124, 5); // Test::impl_test_team_mult_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_mult_mv(0, 5); - Test::impl_test_team_mult_mv(13, 5); - Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(0, 5); + Test::impl_test_team_mult_mv(13, 5); + Test::impl_test_team_mult_mv(124, 5); // Test::impl_test_team_mult_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_mult_mv(124, 5); - Test::impl_test_team_mult_mv(124, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_mult_mv(124, 5); + Test::impl_test_team_mult_mv(124, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_mult_float) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_mv_float) { - test_team_mult_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_mult_float) { test_team_mult(); } +TEST_F(TestCategory, team_mult_mv_float) { test_team_mult_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_mult_double) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_mv_double) { - test_team_mult_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_mult_double) { test_team_mult(); } +TEST_F(TestCategory, team_mult_mv_double) { test_team_mult_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_complex_double) { - test_team_mult, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_team_mult, Kokkos::complex, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_mult_mv_complex_double) { - test_team_mult_mv, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_team_mult_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_mult_int) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_mv_int) { - test_team_mult_mv(); -} +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_mult_int) { test_team_mult(); } +TEST_F(TestCategory, team_mult_mv_int) { test_team_mult_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_mult_double_int) { - test_team_mult(); -} -TEST_F(TestCategory, team_mult_double_mv_int) { - test_team_mult_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_mult_double_int) { test_team_mult(); } +TEST_F(TestCategory, team_mult_double_mv_int) { test_team_mult_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp b/blas/unit_test/Test_Blas1_team_nrm2.hpp index 12192032c9..befec6e57b 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -49,10 +49,8 @@ void impl_test_team_nrm2(int N, int K) { typename AT::mag_type *expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); - for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); - expected_result[j] = - Kokkos::ArithTraits::sqrt(expected_result[j]); + for (int i = 0; i < N; i++) expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); + expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -62,26 +60,22 @@ void impl_test_team_nrm2(int N, int K) { // KokkosBlas::nrm2(r,a); Kokkos::parallel_for( - "KokkosBlas::Test::TeamNrm2", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamNrm2", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId)); + d_r(teamId) = KokkosBlas::Experimental::nrm2(teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], eps * expected_result[k]); } // KokkosBlas::nrm2(r,c_a); Kokkos::parallel_for( - "KokkosBlas::Test::TeamNrm2", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamNrm2", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId)); + d_r(teamId) = + KokkosBlas::Experimental::nrm2(teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -96,8 +90,7 @@ void impl_test_team_nrm2(int N, int K) { template int test_team_nrm2() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; Test::impl_test_team_nrm2(0, 5); Test::impl_test_team_nrm2(13, 5); @@ -106,8 +99,7 @@ int test_team_nrm2() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; Test::impl_test_team_nrm2(0, 5); Test::impl_test_team_nrm2(13, 5); @@ -115,8 +107,7 @@ int test_team_nrm2() { // Test::impl_test_team_nrm2(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_team_nrm2(0, 5); Test::impl_test_team_nrm2(13, 5); @@ -128,28 +119,22 @@ int test_team_nrm2() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_float) { test_team_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_double) { test_team_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_complex_double) { - test_team_nrm2, TestDevice>(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_nrm2_complex_double) { test_team_nrm2, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_int) { test_team_nrm2(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index 212b1e09e9..f3d6707ba3 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -62,60 +62,42 @@ void impl_test_team_scal(int N) { } Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); { ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - typename AT::mag_type divisor = - AT::abs(expected_result) == zero ? one : AT::abs(expected_result); - typename AT::mag_type diff = - AT::abs(nonconst_nonconst_result - expected_result) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result) == zero ? one : AT::abs(expected_result); + typename AT::mag_type diff = AT::abs(nonconst_nonconst_result - expected_result) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( teamMember, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); { ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); - typename AT::mag_type divisor = - AT::abs(expected_result) == zero ? one : AT::abs(expected_result); - typename AT::mag_type diff = - AT::abs(const_nonconst_result - expected_result) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result) == zero ? one : AT::abs(expected_result); + typename AT::mag_type diff = AT::abs(const_nonconst_result - expected_result) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } } @@ -147,8 +129,7 @@ void impl_test_team_scal_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += - ScalarB(a * x.h_view(i, j)) * ScalarB(a * x.h_view(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j)) * ScalarB(a * x.h_view(i, j)); } } @@ -159,21 +140,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::View r("Dot::Result", K); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, - Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_scalar_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_scalar_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_scalar_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -181,21 +158,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA const_scalar_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_scalar_result - expected_result[k]) / divisor; + ScalarA const_scalar_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_scalar_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -211,8 +184,7 @@ void impl_test_team_scal_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += ScalarB((3.0 + j) * x.h_view(i, j)) * - ScalarB((3.0 + j) * x.h_view(i, j)); + expected_result[j] += ScalarB((3.0 + j) * x.h_view(i, j)) * ScalarB((3.0 + j) * x.h_view(i, j)); } } @@ -220,21 +192,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - params(teamId), Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), params(teamId), + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_vector_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_vector_result - expected_result[k]) / divisor; + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(nonconst_vector_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -242,22 +210,17 @@ void impl_test_team_scal_mv(int N, int K) { Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( - "KokkosBlas::Test::TeamScal", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), - params(teamId), - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), params(teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { - ScalarA const_vector_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_vector_result - expected_result[k]) / divisor; + ScalarA const_vector_result = r(k); + typename AT::mag_type divisor = AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); + typename AT::mag_type diff = AT::abs(const_vector_result - expected_result[k]) / divisor; EXPECT_NEAR_KK(diff, zero, eps); } @@ -268,8 +231,7 @@ void impl_test_team_scal_mv(int N, int K) { template int test_team_scal() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_scal(0); @@ -279,8 +241,7 @@ int test_team_scal() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_scal(0); @@ -289,8 +250,7 @@ int test_team_scal() { // Test::impl_test_team_scal(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_scal(0); @@ -299,8 +259,7 @@ int test_team_scal() { // Test::impl_test_team_scal(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_scal(124); Test::impl_test_team_scal(124); #endif @@ -311,8 +270,7 @@ int test_team_scal() { template int test_team_scal_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; Test::impl_test_team_scal_mv(0, 5); @@ -323,8 +281,7 @@ int test_team_scal_mv() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; Test::impl_test_team_scal_mv(0, 5); @@ -334,8 +291,7 @@ int test_team_scal_mv() { // Device>(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; Test::impl_test_team_scal_mv(0, 5); @@ -345,8 +301,7 @@ int test_team_scal_mv() { // Device>(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) Test::impl_test_team_scal_mv(124, 5); Test::impl_test_team_scal_mv(124, 5); #endif @@ -355,57 +310,36 @@ int test_team_scal_mv() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_scal_float) { - test_team_scal(); -} -TEST_F(TestCategory, team_scal_mv_float) { - test_team_scal_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_scal_float) { test_team_scal(); } +TEST_F(TestCategory, team_scal_mv_float) { test_team_scal_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_scal_double) { - test_team_scal(); -} -TEST_F(TestCategory, team_scal_mv_double) { - test_team_scal_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_scal_double) { test_team_scal(); } +TEST_F(TestCategory, team_scal_mv_double) { test_team_scal_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_complex_double) { - test_team_scal, Kokkos::complex, - TestDevice>(); + test_team_scal, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_scal_mv_complex_double) { - test_team_scal_mv, Kokkos::complex, - TestDevice>(); + test_team_scal_mv, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_int) { test_team_scal(); } -TEST_F(TestCategory, team_scal_mv_int) { - test_team_scal_mv(); -} +TEST_F(TestCategory, team_scal_mv_int) { test_team_scal_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_scal_double_int) { - test_team_scal(); -} -TEST_F(TestCategory, team_scal_double_mv_int) { - test_team_scal_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_scal_double_int) { test_team_scal(); } +TEST_F(TestCategory, team_scal_double_mv_int) { test_team_scal_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_setscal.hpp b/blas/unit_test/Test_Blas1_team_setscal.hpp index 4d2499a466..33b264aa79 100644 --- a/blas/unit_test/Test_Blas1_team_setscal.hpp +++ b/blas/unit_test/Test_Blas1_team_setscal.hpp @@ -33,35 +33,27 @@ enum : int { BlasSet = 0, BlasScale = 1 }; struct KokkosKernelTag {}; struct NaiveTag {}; -template +template struct Functor_TestBlasTeamMatUtil { using execution_space = typename DeviceType::execution_space; ScalarType _alpha; ViewType _a; KOKKOS_INLINE_FUNCTION - Functor_TestBlasTeamMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} + Functor_TestBlasTeamMatUtil(const ScalarType alpha, const ViewType &a) : _alpha(alpha), _a(a) {} template - KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const MemberType &member) const { const int i = member.league_rank(); auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); switch (TestID) { - case BlasSet: - KokkosBlas::TeamSet::invoke(member, _alpha, A); - break; - case BlasScale: - KokkosBlas::TeamScale::invoke(member, _alpha, A); - break; + case BlasSet: KokkosBlas::TeamSet::invoke(member, _alpha, A); break; + case BlasScale: KokkosBlas::TeamScale::invoke(member, _alpha, A); break; } } template - KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, const MemberType &member) const { if (member.team_rank() == 0) { const int k = member.league_rank(); auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); @@ -85,21 +77,15 @@ struct Functor_TestBlasTeamMatUtil { typedef typename ViewType::value_type value_type; std::string name_region("KokkosBlas::Test::SerialMatUtil"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BlasSet ? "Set" - : TestID == BlasScale ? "Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = (TestID == BlasSet ? "Set" : TestID == BlasScale ? "Scale" : "UnknownTest"); + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); @@ -107,8 +93,7 @@ struct Functor_TestBlasTeamMatUtil { } }; -template +template void impl_test_blas_matutil(const int N, const int BlkSize) { /// typedefs typedef typename ViewType::value_type value_type; @@ -119,8 +104,7 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { ViewType a("a", N, BlkSize, BlkSize); ViewType b("b", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(a, random, value_type(1.0)); Kokkos::fence(); @@ -128,12 +112,8 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b, a); /// test body - Functor_TestBlasTeamMatUtil(alpha, a) - .run(); - Functor_TestBlasTeamMatUtil(alpha, b) - .run(); + Functor_TestBlasTeamMatUtil(alpha, a).run(); + Functor_TestBlasTeamMatUtil(alpha, b).run(); Kokkos::fence(); @@ -145,45 +125,32 @@ void impl_test_blas_matutil(const int N, const int BlkSize) { Kokkos::deep_copy(b_host, b); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); + for (int j = 0; j < BlkSize; ++j) EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); } } // namespace TeamMatUtil } // namespace Test -template +template int test_blas_team_matutil() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_blas_matutil(0, 10); - Test::TeamMatUtil::impl_test_blas_matutil(10, 15); - Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); - Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + typedef Kokkos::View ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_blas_matutil(0, 10); - Test::TeamMatUtil::impl_test_blas_matutil(10, 15); - Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); - Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + typedef Kokkos::View ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); } #endif @@ -214,19 +181,15 @@ TEST_F(TestCategory, blas_scalar_team_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_team_set_dcomplex_dcomplex) { - test_blas_team_matutil, - Kokkos::complex, ::Test::BlasSet>(); + test_blas_team_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_dcomplex) { - test_blas_team_matutil, - Kokkos::complex, ::Test::BlasScale>(); + test_blas_team_matutil, Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_team_set_dcomplex_double) { - test_blas_team_matutil, double, - ::Test::BlasSet>(); + test_blas_team_matutil, double, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_double) { - test_blas_team_matutil, double, - ::Test::BlasScale>(); + test_blas_team_matutil, double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index cfc76455f3..27765b0936 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -66,64 +66,42 @@ void impl_test_team_update(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)) * - ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)); + expected_result += ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)) * + ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)); // KokkosBlas::update(a,x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( teamMember, a, - Kokkos::subview( - x.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); @@ -131,28 +109,18 @@ void impl_test_team_update(int N) { Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,c_y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( teamMember, a, - Kokkos::subview( - x.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(x.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, - Kokkos::subview( - y.d_view_const, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + Kokkos::subview(y.d_view_const, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, - Kokkos::subview( - z.d_view, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + Kokkos::subview(z.d_view, Kokkos::make_pair(teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); @@ -196,10 +164,8 @@ void impl_test_team_update_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarC(); for (int i = 0; i < N; i++) - expected_result[j] += - ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + - c * z.h_view(i, j)) * - ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)); + expected_result[j] += ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)) * + ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -208,38 +174,31 @@ void impl_test_team_update_mv(int N, int K) { // KokkosBlas::update(a,x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::update( - teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, - Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( - "KokkosBlas::Test::TeamUpdate", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { + "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::update( - teamMember, a, - Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, - Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, - Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); + KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(const_non_const_result, expected_result[k], eps * expected_result[k]); } delete[] expected_result; @@ -249,58 +208,43 @@ void impl_test_team_update_mv(int N, int K) { template int test_team_update() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_update(0); - Test::impl_test_team_update(13); - Test::impl_test_team_update(124); + Test::impl_test_team_update(0); + Test::impl_test_team_update(13); + Test::impl_test_team_update(124); // Test::impl_test_team_update(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_update(0); - Test::impl_test_team_update(13); - Test::impl_test_team_update(124); + Test::impl_test_team_update(0); + Test::impl_test_team_update(13); + Test::impl_test_team_update(124); // Test::impl_test_team_update(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_update(0); - Test::impl_test_team_update(13); - Test::impl_test_team_update(124); + Test::impl_test_team_update(0); + Test::impl_test_team_update(13); + Test::impl_test_team_update(124); // Test::impl_test_team_update(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_update(124); - Test::impl_test_team_update(124); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_update(124); + Test::impl_test_team_update(124); #endif return 1; @@ -309,117 +253,79 @@ int test_team_update() { template int test_team_update_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_update_mv(0, 5); - Test::impl_test_team_update_mv(13, 5); - Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(0, 5); + Test::impl_test_team_update_mv(13, 5); + Test::impl_test_team_update_mv(124, 5); // Test::impl_test_team_update_mv(132231,5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_update_mv(0, 5); - Test::impl_test_team_update_mv(13, 5); - Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(0, 5); + Test::impl_test_team_update_mv(13, 5); + Test::impl_test_team_update_mv(124, 5); // Test::impl_test_team_update_mv(132231,5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_update_mv(0, 5); - Test::impl_test_team_update_mv(13, 5); - Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(0, 5); + Test::impl_test_team_update_mv(13, 5); + Test::impl_test_team_update_mv(124, 5); // Test::impl_test_team_update_mv(132231,5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_update_mv(124, 5); - Test::impl_test_team_update_mv(124, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_team_update_mv(124, 5); + Test::impl_test_team_update_mv(124, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_update_float) { - test_team_update(); -} -TEST_F(TestCategory, team_update_mv_float) { - test_team_update_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_update_float) { test_team_update(); } +TEST_F(TestCategory, team_update_mv_float) { test_team_update_mv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_update_double) { - test_team_update(); -} -TEST_F(TestCategory, team_update_mv_double) { - test_team_update_mv(); -} + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_update_double) { test_team_update(); } +TEST_F(TestCategory, team_update_mv_double) { test_team_update_mv(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_complex_double) { - test_team_update, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_team_update, Kokkos::complex, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_update_mv_complex_double) { - test_team_update_mv, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_team_update_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_update_int) { - test_team_update(); -} -TEST_F(TestCategory, team_update_mv_int) { - test_team_update_mv(); -} +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, team_update_int) { test_team_update(); } +TEST_F(TestCategory, team_update_mv_int) { test_team_update_mv(); } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_update_double_int) { - test_team_update(); -} -TEST_F(TestCategory, team_update_double_mv_int) { - test_team_update_mv(); -} +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, team_update_double_int) { test_team_update(); } +TEST_F(TestCategory, team_update_double_mv_int) { test_team_update_mv(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index cfeddb9d3d..6152a3493b 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -37,8 +37,7 @@ void impl_test_update(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -64,27 +63,21 @@ void impl_test_update(int N) { KokkosBlas::update(a, x.d_view, b, y.d_view, c, z.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + - c * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + c * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); KokkosBlas::update(a, x.d_view_const, b, y.d_view, c, z.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + - c * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + c * org_z.h_view(i)), z.h_view(i), eps); } Kokkos::deep_copy(z.d_base, org_z.h_base); KokkosBlas::update(a, x.d_view_const, b, y.d_view_const, c, z.d_view); Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + - c * org_z.h_view(i)), - z.h_view(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + c * org_z.h_view(i)), z.h_view(i), eps); } } @@ -99,8 +92,7 @@ void impl_test_update_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarA randStart, randEnd; @@ -133,10 +125,8 @@ void impl_test_update_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + - c * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } @@ -145,10 +135,8 @@ void impl_test_update_mv(int N, int K) { Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + - c * org_z.h_view(i, j)), - z.h_view(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } } @@ -157,58 +145,43 @@ void impl_test_update_mv(int N, int K) { template int test_update() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); // Test::impl_test_update(132231); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); // Test::impl_test_update(132231); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); // Test::impl_test_update(132231); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update(1024); - Test::impl_test_update(1024); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update(1024); + Test::impl_test_update(1024); #endif return 1; @@ -217,66 +190,47 @@ int test_update() { template int test_update_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; - Test::impl_test_update_mv(0, 5); - Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_update_mv(0, 5); - Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_update_mv(0, 5); - Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(1024, 5); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(1024, 5); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_float"); test_update(); @@ -290,8 +244,7 @@ TEST_F(TestCategory, update_mv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double"); test_update(); @@ -304,25 +257,21 @@ TEST_F(TestCategory, update_mv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_complex_double"); - test_update, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_update, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_complex_double"); - test_update_mv, Kokkos::complex, - Kokkos::complex, TestDevice>(); + test_update_mv, Kokkos::complex, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_int"); test_update(); @@ -335,8 +284,7 @@ TEST_F(TestCategory, update_mv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, update_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double_int"); test_update(); diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index b3f3566f83..d70935c2ac 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -21,10 +21,8 @@ #include namespace Test { -template -void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, - int N) { +template +void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeX::value_type ScalarX; typedef typename ViewTypeY::value_type ScalarY; @@ -70,10 +68,8 @@ void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, Kokkos::fill_random(space, A.d_view, rand_pool, randStart, randEnd); } - const typename KAT_Y::mag_type max_error = - KAT_Y::abs(alpha * max_valA * max_valX * ldx + beta * max_valY); - const typename KAT_Y::mag_type tol = - max_error * eps * 2; // adding small fudge factor of 2 + const typename KAT_Y::mag_type max_error = KAT_Y::abs(alpha * max_valA * max_valX * ldx + beta * max_valY); + const typename KAT_Y::mag_type tol = max_error * eps * 2; // adding small fudge factor of 2 Kokkos::deep_copy(org_y.h_base, y.d_base); Kokkos::deep_copy(x.h_base, x.d_base); @@ -89,39 +85,33 @@ void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) { numErrors++; - std::cerr << __FILE__ << ":" << __LINE__ - << ": expected(i)=" << expected(i) << ", h_y(i)=" << y.h_view(i) + std::cerr << __FILE__ << ":" << __LINE__ << ": expected(i)=" << expected(i) << ", h_y(i)=" << y.h_view(i) << std::endl; } } - EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N - << ", alpha = " << alpha << ", beta = " << beta + EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; Kokkos::deep_copy(space, y.d_base, org_y.h_base); - KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view_const, beta, - y.d_view); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view_const, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; Kokkos::fence(); // Wait for vanillaGEMV for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } - EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N - << ", alpha = " << alpha << ", beta = " << beta + EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; Kokkos::deep_copy(space, y.d_base, org_y.h_base); - KokkosBlas::gemv(space, mode, alpha, A.d_view_const, x.d_view_const, beta, - y.d_view); + KokkosBlas::gemv(space, mode, alpha, A.d_view_const, x.d_view_const, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } - EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N - << ", alpha = " << alpha << ", beta = " << beta - << ", mode " << mode << ": gemv incorrect"; + EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha + << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; // Test once with beta = 0, but with y initially filled with NaN. // This should overwrite the NaNs with the correct result. beta = KAT_Y::zero(); @@ -135,32 +125,28 @@ void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, numErrors = 0; for (int i = 0; i < ldy; i++) { if (KAT_Y::isNan(y.h_view(i)) || - KAT_Y::abs(expected(i) - y.h_view(i)) > - KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { + KAT_Y::abs(expected(i) - y.h_view(i)) > KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { numErrors++; - std::cerr << __FILE__ << ":" << __LINE__ << ": expected(" << i - << ")=" << expected(i) << ", h_y(" << i << ")=" << y.h_view(i) - << ", eps=" << eps - << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << ": expected(" << i << ")=" << expected(i) << ", h_y(" << i + << ")=" << y.h_view(i) << ", eps=" << eps << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() + << std::endl; } } - EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' - << N << ", mode " << mode << ": gemv incorrect"; + EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode + << ": gemv incorrect"; } template void impl_test_gemv(const char* mode, int M, int N) { using execution_space = typename Device::execution_space; execution_space space; - impl_test_gemv_streams(space, mode, M, N); + impl_test_gemv_streams(space, mode, M, N); } } // namespace Test template int test_gemv(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ll; typedef Kokkos::View view_type_b_ll; typedef Kokkos::View view_type_c_ll; @@ -172,85 +158,58 @@ int test_gemv(const char* mode) { Test::impl_test_gemv(mode,10,200); Test::impl_test_gemv(mode,200,10); #endif - Test::impl_test_gemv( - mode, 0, 1024); - Test::impl_test_gemv( - mode, 1024, 0); - Test::impl_test_gemv( - mode, 13, 13); - Test::impl_test_gemv( - mode, 13, 1024); - Test::impl_test_gemv( - mode, 50, 40); - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 2131, 2131); + Test::impl_test_gemv(mode, 0, 1024); + Test::impl_test_gemv(mode, 1024, 0); + Test::impl_test_gemv(mode, 13, 13); + Test::impl_test_gemv(mode, 13, 1024); + Test::impl_test_gemv(mode, 50, 40); + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 2131, 2131); // Test::impl_test_gemv(mode,132231,1024); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; - Test::impl_test_gemv( - mode, 0, 1024); - Test::impl_test_gemv( - mode, 1024, 0); - Test::impl_test_gemv( - mode, 13, 13); - Test::impl_test_gemv( - mode, 13, 1024); - Test::impl_test_gemv( - mode, 50, 40); - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 2131, 2131); + Test::impl_test_gemv(mode, 0, 1024); + Test::impl_test_gemv(mode, 1024, 0); + Test::impl_test_gemv(mode, 13, 13); + Test::impl_test_gemv(mode, 13, 1024); + Test::impl_test_gemv(mode, 50, 40); + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 2131, 2131); // Test::impl_test_gemv(mode,132231,1024); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; - Test::impl_test_gemv( - mode, 0, 1024); - Test::impl_test_gemv( - mode, 1024, 0); - Test::impl_test_gemv( - mode, 13, 13); - Test::impl_test_gemv( - mode, 13, 1024); - Test::impl_test_gemv( - mode, 50, 40); - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 2131, 2131); + Test::impl_test_gemv(mode, 0, 1024); + Test::impl_test_gemv(mode, 1024, 0); + Test::impl_test_gemv(mode, 13, 13); + Test::impl_test_gemv(mode, 13, 1024); + Test::impl_test_gemv(mode, 50, 40); + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 2131, 2131); // Test::impl_test_gemv(mode,132231,1024); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 1024, 1024); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_gemv(mode, 1024, 1024); + Test::impl_test_gemv(mode, 1024, 1024); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_float"); test_gemv("N"); @@ -263,8 +222,7 @@ TEST_F(TestCategory, gemv_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double"); test_gemv("N"); @@ -277,29 +235,24 @@ TEST_F(TestCategory, gemv_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_complex_double"); - test_gemv, Kokkos::complex, - Kokkos::complex, TestDevice>("N"); + test_gemv, Kokkos::complex, Kokkos::complex, TestDevice>("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_complex_double"); - test_gemv, Kokkos::complex, - Kokkos::complex, TestDevice>("T"); + test_gemv, Kokkos::complex, Kokkos::complex, TestDevice>("T"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_conj_complex_double"); - test_gemv, Kokkos::complex, - Kokkos::complex, TestDevice>("C"); + test_gemv, Kokkos::complex, Kokkos::complex, TestDevice>("C"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_int"); test_gemv("N"); @@ -311,8 +264,7 @@ TEST_F(TestCategory, gemv_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, gemv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double_int"); test_gemv("N"); @@ -332,34 +284,33 @@ int test_gemv_streams(const char* mode) { using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; using view_type_c_ll = Kokkos::View; - Test::impl_test_gemv_streams(space, mode, 0, 1024); - Test::impl_test_gemv_streams(space, mode, 13, 1024); - Test::impl_test_gemv_streams(space, mode, 50, 40); + Test::impl_test_gemv_streams(space, mode, 0, + 1024); + Test::impl_test_gemv_streams(space, mode, 13, + 1024); + Test::impl_test_gemv_streams(space, mode, 50, + 40); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) using view_type_a_lr = Kokkos::View; using view_type_b_lr = Kokkos::View; using view_type_c_lr = Kokkos::View; - Test::impl_test_gemv_streams(space, mode, 0, 1024); - Test::impl_test_gemv_streams(space, mode, 13, 1024); - Test::impl_test_gemv_streams(space, mode, 50, 40); + Test::impl_test_gemv_streams(space, mode, 0, + 1024); + Test::impl_test_gemv_streams(space, mode, 13, + 1024); + Test::impl_test_gemv_streams(space, mode, 50, + 40); #endif (void)space; return 1; } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - blas##_##gemv_streams##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gemv_streams("N"); \ - test_gemv_streams("T"); \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, blas##_##gemv_streams##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gemv_streams("N"); \ + test_gemv_streams("T"); \ } #define NO_TEST_COMPLEX diff --git a/blas/unit_test/Test_Blas2_gemv_util.hpp b/blas/unit_test/Test_Blas2_gemv_util.hpp index e28310c8eb..724a2fc004 100644 --- a/blas/unit_test/Test_Blas2_gemv_util.hpp +++ b/blas/unit_test/Test_Blas2_gemv_util.hpp @@ -23,16 +23,12 @@ namespace Test { -template ::value> -using simd_vector = - KokkosBatched::Vector, length>; +template ::value> +using simd_vector = KokkosBatched::Vector, length>; template struct GemvOpBase { - GemvOpBase(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + GemvOpBase(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : trans(trans_), alpha(alpha_), beta(beta_), A(A_), x(x_), y(y_) {} protected: @@ -52,42 +48,32 @@ template struct RefGEMVOp : public GemvOpBase { using params = GemvOpBase; - RefGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + RefGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template - KOKKOS_INLINE_FUNCTION void operator()( - const TeamMember & /* member */) const { - vanillaGEMV(params::trans, params::alpha, params::A, params::x, - params::beta, params::y); + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember & /* member */) const { + vanillaGEMV(params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; // RefGEMVOp // fill regular view with random values -template -typename std::enable_if::value>::type -fill_random_view(ViewType A, PoolType &rand_pool, - const ScalarType max_val = 10.0) { +template +typename std::enable_if::value>::type fill_random_view( + ViewType A, PoolType &rand_pool, const ScalarType max_val = 10.0) { Kokkos::fill_random(A, rand_pool, max_val); Kokkos::fence(); } // fill rank-1 view of SIMD vectors with random values -template +template void fill_random_view( - Kokkos::View< - KokkosBatched::Vector, VecLength> *, - Layout, Props...> - x, + Kokkos::View, VecLength> *, Layout, Props...> x, PoolType &rand_pool, const ValueType max_val = 10.0) { // the view can be strided and have Vector values, so randoms // are generated in a plain, linear view first and then copied using device_type = typename decltype(x)::device_type; - Kokkos::View rnd("random_vals", - x.extent(0) * VecLength); + Kokkos::View rnd("random_vals", x.extent(0) * VecLength); Kokkos::fill_random(rnd, rand_pool, max_val); using size_type = decltype(x.extent(0)); for (size_type i = 0; i < x.extent(0); ++i) { @@ -96,19 +82,14 @@ void fill_random_view( } // fill rank-2 view of SIMD vectors with random values -template +template static void fill_random_view( - Kokkos::View< - KokkosBatched::Vector, VecLength> **, - Layout, Props...> - A, + Kokkos::View, VecLength> **, Layout, Props...> A, PoolType &rand_pool, const ValueType max_val = 10.0) { // the view can be strided and have Vector values, so randoms // are generated in a plain, linear view first and then copied using device_type = typename decltype(A)::device_type; - Kokkos::View rnd( - "random_vals", A.extent(0) * A.extent(1) * VecLength); + Kokkos::View rnd("random_vals", A.extent(0) * A.extent(1) * VecLength); Kokkos::fill_random(rnd, rand_pool, max_val); using size_type = decltype(A.extent(0)); size_type idx = 0; @@ -120,29 +101,22 @@ static void fill_random_view( } } -template +template struct GEMVTest { - static void run(const char *mode) { - run_algorithms<0, typename GemvFunc::algorithms>(mode); - } + static void run(const char *mode) { run_algorithms<0, typename GemvFunc::algorithms>(mode); } private: // ScalarCoef==void default behavior is to derive alpha/beta scalar types // from A and X scalar types - using ScalarType = typename std::conditional< - !std::is_void::value, ScalarCoef, - typename std::common_type::type>::type; + using ScalarType = typename std::conditional::value, ScalarCoef, + typename std::common_type::type>::type; template - static std::enable_if_t::value> - run_algorithms(const char * /*mode*/) {} + static std::enable_if_t::value> run_algorithms(const char * /*mode*/) {} template - static - typename std::enable_if<(Idx < - std::tuple_size::value)>::type - run_algorithms(const char *mode) { + static typename std::enable_if<(Idx < std::tuple_size::value)>::type run_algorithms( + const char *mode) { run_layouts::type>(mode); run_algorithms(mode); } @@ -156,8 +130,7 @@ struct GEMVTest { #ifdef KOKKOSKERNELS_TEST_LAYOUTRIGHT run_view_types(mode); #endif -#if defined(KOKKOSKERNELS_TEST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_TEST_LAYOUTRIGHT) +#if defined(KOKKOSKERNELS_TEST_LAYOUTLEFT) && defined(KOKKOSKERNELS_TEST_LAYOUTRIGHT) using A_t = typename Kokkos::View; using x_t = typename Kokkos::View; using y_t = typename Kokkos::View; @@ -224,24 +197,16 @@ struct GEMVTest { auto y = Kokkos::subview(b_y, 0, Kokkos::ALL(), 0); // make sure it's actually LayoutStride there - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value, ""); + static_assert(std::is_same::value, ""); + static_assert(std::is_same::value, ""); run_views(trans, A, x, y); } } template - static void run_views(const char trans, ViewTypeA A, ViewTypeX x, - ViewTypeY y) { - Kokkos::TeamPolicy teams( - 1, 1); // just run on device + static void run_views(const char trans, ViewTypeA A, ViewTypeX x, ViewTypeY y) { + Kokkos::TeamPolicy teams(1, 1); // just run on device fill_inputs(A, x, y); ScalarType alpha = 3; // TODO: test also with zero alpha/beta ? ScalarType beta = 5; @@ -249,8 +214,7 @@ struct GEMVTest { // get reference results Kokkos::View y_ref("Y_ref", y.extent(0)); Kokkos::deep_copy(y_ref, y); - RefGEMVOp gemv_ref( - trans, alpha, A, x, beta, y_ref); + RefGEMVOp gemv_ref(trans, alpha, A, x, beta, y_ref); Kokkos::parallel_for(teams, gemv_ref); // 1. check non-consts @@ -265,10 +229,8 @@ struct GEMVTest { run_case(trans, alpha, c_A, c_x, beta, y, y_ref); } - template - static void run_case(const char trans, ScalarType alpha, ViewTypeA A, - ViewTypeX x, ScalarType beta, ViewTypeY y, + template + static void run_case(const char trans, ScalarType alpha, ViewTypeA A, ViewTypeX x, ScalarType beta, ViewTypeY y, ViewTypeYRef y_ref) { // run on original y view (not to alter the test) // but backup it and restore, so it can be reused @@ -277,12 +239,10 @@ struct GEMVTest { // fetch GEMV functor from the factory using op_type = - typename GemvFunc::template functor_type; + typename GemvFunc::template functor_type; op_type gemv_op(trans, alpha, A, x, beta, y); - Kokkos::parallel_for( - Kokkos::TeamPolicy(1, 1), gemv_op); + Kokkos::parallel_for(Kokkos::TeamPolicy(1, 1), gemv_op); const double eps = epsilon(ScalarY{}); EXPECT_NEAR_KK_REL_1DVIEW(y, y_ref, eps); @@ -317,24 +277,15 @@ struct GEMVTest { } // namespace Test -#define TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR_A, SCALAR_X, SCALAR_Y, \ - SCALAR_COEF) \ - using PREFIX##_##NAME##_gemv_test = \ - ::Test::GEMVTest<::Test::FACTORY, SCALAR_A, SCALAR_X, SCALAR_Y, \ - TestDevice, SCALAR_COEF>; \ - TEST_F(TestCategory, PREFIX##_gemv_nt_##NAME) { \ - PREFIX##_##NAME##_gemv_test::run("N"); \ - } \ - TEST_F(TestCategory, PREFIX##_gemv_t_##NAME) { \ - PREFIX##_##NAME##_gemv_test::run("T"); \ - } \ - TEST_F(TestCategory, PREFIX##_gemv_ct_##NAME) { \ - PREFIX##_##NAME##_gemv_test::run("C"); \ - } +#define TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR_A, SCALAR_X, SCALAR_Y, SCALAR_COEF) \ + using PREFIX##_##NAME##_gemv_test = \ + ::Test::GEMVTest<::Test::FACTORY, SCALAR_A, SCALAR_X, SCALAR_Y, TestDevice, SCALAR_COEF>; \ + TEST_F(TestCategory, PREFIX##_gemv_nt_##NAME) { PREFIX##_##NAME##_gemv_test::run("N"); } \ + TEST_F(TestCategory, PREFIX##_gemv_t_##NAME) { PREFIX##_##NAME##_gemv_test::run("T"); } \ + TEST_F(TestCategory, PREFIX##_gemv_ct_##NAME) { PREFIX##_##NAME##_gemv_test::run("C"); } #define TEST_CASE2(PREFIX, FACTORY, NAME, SCALAR, SCALAR_COEF) \ TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR, SCALAR, SCALAR, SCALAR_COEF) -#define TEST_CASE(PREFIX, FACTORY, NAME, SCALAR) \ - TEST_CASE2(PREFIX, FACTORY, NAME, SCALAR, SCALAR) +#define TEST_CASE(PREFIX, FACTORY, NAME, SCALAR) TEST_CASE2(PREFIX, FACTORY, NAME, SCALAR, SCALAR) #endif // TEST_BLAS2_GEMV_UTIL_HPP diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 9a8f740569..6e975532e1 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -53,107 +53,85 @@ namespace Test { -template +template class GerTester { public: GerTester(); ~GerTester(); - void test(const int M, const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults = false, - const bool useHermitianOption = false); + void test(const int M, const int N, const int nonConstConstCombinations, const bool useAnalyticalResults = false, + const bool useHermitianOption = false); private: using _ViewTypeX = Kokkos::View; using _ViewTypeY = Kokkos::View; using _ViewTypeA = Kokkos::View; - using _HostViewTypeX = typename _ViewTypeX::HostMirror; - using _HostViewTypeY = typename _ViewTypeY::HostMirror; - using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = - Kokkos::View; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown); + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkGerAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation); + void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -169,16 +147,13 @@ class GerTester { bool _kkGerShouldThrowException; }; -template -GerTester::GerTester() +template +GerTester::GerTester() : _A_is_complex(std::is_same>::value || std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< - typename Device::execution_space>()) + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr && _testIsGpu) @@ -195,12 +170,8 @@ GerTester::value - ? 1.0e-6 - : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value - ? 5.0e-3 - : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -208,31 +179,24 @@ GerTester -GerTester::~GerTester() { +template +GerTester::~GerTester() { // Nothing to do } -template -void GerTester::test(const int M, const int N, - const int nonConstConstCombinations, - const bool useAnalyticalResults, - const bool useHermitianOption) { +template +void GerTester::test( + const int M, const int N, const int nonConstConstCombinations, const bool useAnalyticalResults, + const bool useHermitianOption) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Entering GerTester::test()... - - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - - - " << std::endl; - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + std::cout << "_A_is_complex = " << _A_is_complex << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " - << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol << ", _relTol = " << _relTol << std::endl; #endif // ******************************************************************** @@ -277,8 +241,7 @@ void GerTester y("Y", _N); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected( - "expected A += alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * y^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(0.); @@ -286,21 +249,16 @@ void GerTesterpopulateVariables(alpha, x, y, A, h_expected.d_view, - expectedResultIsKnown); + this->populateVariables(alpha, x, y, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 9: populate h_vanilla // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla( - "vanilla = A + alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * y^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG - Kokkos::printf( - "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", - typeid(alpha).name()); + Kokkos::printf("In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); #endif - this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, - h_vanilla.d_view); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); // ******************************************************************** // Step 4 of 9: use h_vanilla and h_expected as appropriate @@ -309,8 +267,7 @@ void GerTestercompareVanillaAgainstExpected(alpha, h_vanilla.d_view, - h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -325,8 +282,7 @@ void GerTestercallKkGerAndCompareAgainstExpected( - alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const {x,y}"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const {x,y}"); } // ******************************************************************** @@ -335,8 +291,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, A, - h_expected.d_view, "const x"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, A, h_expected.d_view, "const x"); } // ******************************************************************** @@ -345,8 +300,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, A, - h_expected.d_view, "const y"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, A, h_expected.d_view, "const y"); } // ******************************************************************** @@ -355,9 +309,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, - y.d_view_const, A, - h_expected.d_view, "const {x,y}"); + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, + "const {x,y}"); } // ******************************************************************** @@ -376,21 +329,14 @@ void GerTester -void GerTester< - ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, - Device>::populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { +template +void GerTester::populateVariables( + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, - h_expected); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); Kokkos::deep_copy(x.d_base, x.h_base); Kokkos::deep_copy(y.d_base, y.h_base); Kokkos::deep_copy(A.d_base, A.h_base); @@ -455,8 +401,7 @@ void GerTester< } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarX randStart, randEnd; @@ -483,17 +428,12 @@ void GerTester< } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +GerTester::populateAnalyticalValues( + T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { _AuxType auxI(0.); _AuxType auxJ(0.); _AuxType auxIpJ(0.); @@ -518,26 +458,20 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - h_A(i, j).real() = - -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); - h_A(i, j).imag() = - -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + h_A(i, j).imag() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); } } } else { for (int i = 0; i < _M; ++i) { auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); - h_A(i, j).real() = - -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); - h_A(i, j).imag() = - -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_A(i, j).real() = -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + h_A(i, j).imag() = -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); } } } @@ -546,9 +480,8 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j).real() = -2. * sin(auxI) * sin(auxJ); h_expected(i, j).imag() = 2. * (cos(auxIpJ) - sin(auxIpJ)); } @@ -557,9 +490,8 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j).real() = 2. * cos(auxI) * cos(auxJ); h_expected(i, j).imag() = -2. * sin(auxImJ); } @@ -568,17 +500,12 @@ GerTester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::populateAnalyticalValues( + T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { _AuxType auxI(0.); _AuxType auxJ(0.); _AuxType auxIpJ(0.); @@ -605,25 +532,20 @@ GerTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j) = 3 * sin(auxIpJ); } } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +GerTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { @@ -656,18 +578,13 @@ GerTester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { @@ -683,11 +600,10 @@ GerTester +template template -T GerTester::shrinkAngleToZeroTwoPiRange(const T input) { +T GerTester::shrinkAngleToZeroTwoPiRange( + const T input) { T output(input); #if 0 T twoPi( 2. * Kokkos::numbers::pi ); @@ -702,18 +618,13 @@ T GerTester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +GerTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsRealAbs(0); @@ -732,7 +643,7 @@ GerTester:: for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; if (h_expected(i, j).real() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -756,17 +667,15 @@ GerTester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << ", _KAT_A::abs(h_expected(i,j).real() - " "h_vanilla(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } - diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; if (h_expected(i, j).imag() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -790,37 +699,26 @@ GerTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << ", _KAT_A::abs(h_expected(i,j).imag() - " "h_vanilla(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from analytical on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_vanilla(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -829,29 +727,19 @@ GerTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_vanilla(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -860,8 +748,7 @@ GerTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } else { int numErrorsReal(0); @@ -872,11 +759,8 @@ GerTester:: if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " - << h_expected(i, j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; #endif } numErrorsReal++; @@ -885,49 +769,37 @@ GerTester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " - << h_expected(i, j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; #endif } numErrorsImag++; } } // for j } // for i - EXPECT_EQ(numErrorsReal, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect on real components" - << ", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsAbs(0); @@ -965,35 +837,24 @@ GerTester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_vanilla(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_vanilla(i,j) = " << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1012,8 +873,7 @@ GerTester:: if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; #endif } @@ -1021,29 +881,22 @@ GerTester:: } } // for j } // for i - EXPECT_EQ(numErrors, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -GerTester:: - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +GerTester::compareKkGerAgainstExpected( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsRealAbs(0); int numErrorsRealRel(0); @@ -1084,12 +937,10 @@ GerTester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() - << ", h_A(i,j).real() = " << h_A(i, j).real() - << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } @@ -1117,90 +968,56 @@ GerTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() - << ", h_A(i,j).imag() = " << h_A(i, j).imag() - << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", h_expected(11, 2119) = (" << h_expected(11, 2119).real() - << ", " << h_expected(11, 2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " - << h_A(11, 2119).imag() << ")" << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(11, 2119) = (" << h_expected(11, 2119).real() << ", " << h_expected(11, 2119).imag() + << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", h_expected(710, 1065) = (" << h_expected(710, 1065).real() - << ", " << h_expected(710, 1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " - << h_A(710, 1065).imag() << ")" << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(710, 1065) = (" << h_expected(710, 1065).real() << ", " << h_expected(710, 1065).imag() + << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " << h_A(710, 1065).imag() << ")" << std::endl; } #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -1213,24 +1030,15 @@ GerTester:: } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -1244,17 +1052,13 @@ GerTester:: } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -GerTester:: - compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected) { - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +GerTester::compareKkGerAgainstExpected( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsAbs(0); int numErrorsRel(0); @@ -1290,52 +1094,33 @@ GerTester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) - << ", h_A(i,j) = " << h_A(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1348,15 +1133,11 @@ GerTester:: } } -template +template template -void GerTester:: - callKkGerAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation) { +void GerTester::callKkGerAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, const _ViewTypeExpected& h_expected, + const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): " @@ -1370,25 +1151,21 @@ void GerTester::value || - std::is_same::value || + bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool yBool = std::is_same::value || - std::is_same::value || + bool yBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool aBool = std::is_same::value || - std::is_same::value || + bool aBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; bool useAnalyticalResults = xBool && yBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1435,8 +1207,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); @@ -1471,8 +1242,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1480,8 +1250,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); @@ -1515,8 +1284,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1524,8 +1292,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); @@ -1556,8 +1323,7 @@ int test_ger(const std::string& /*caseName*/) { #endif #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1565,8 +1331,7 @@ int test_ger(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); #endif if (true) { - Test::GerTester + Test::GerTester tester; tester.test(1024, 1024, 0); if (useAnalyticalResults) { @@ -1578,8 +1343,7 @@ int test_ger(const std::string& /*caseName*/) { } if (true) { - Test::GerTester + Test::GerTester tester; tester.test(1024, 1024, 0); } @@ -1602,8 +1366,7 @@ int test_ger(const std::string& /*caseName*/) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_float"); test_ger("test case ger_float"); @@ -1612,19 +1375,17 @@ TEST_F(TestCategory, ger_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_float"); - test_ger, Kokkos::complex, - Kokkos::complex, TestDevice>("test case ger_complex_float"); + test_ger, Kokkos::complex, Kokkos::complex, TestDevice>( + "test case ger_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double"); test_ger("test case ger_double"); @@ -1633,19 +1394,17 @@ TEST_F(TestCategory, ger_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_double"); - test_ger, Kokkos::complex, - Kokkos::complex, TestDevice>("test case ger_complex_double"); + test_ger, Kokkos::complex, Kokkos::complex, TestDevice>( + "test case ger_complex_double"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_int"); test_ger("test case ger_int"); @@ -1653,8 +1412,7 @@ TEST_F(TestCategory, ger_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, ger_double_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int_float"); test_ger("test case ger_double_int_float"); diff --git a/blas/unit_test/Test_Blas2_serial_gemv.hpp b/blas/unit_test/Test_Blas2_serial_gemv.hpp index 5c1aaf5a67..805ac1d283 100644 --- a/blas/unit_test/Test_Blas2_serial_gemv.hpp +++ b/blas/unit_test/Test_Blas2_serial_gemv.hpp @@ -21,39 +21,31 @@ namespace Test { -template +template struct SerialGEMVOp : public GemvOpBase { using params = GemvOpBase; - SerialGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + SerialGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const { KokkosBlas::Experimental::Gemv::invoke( - member, params::trans, params::alpha, params::A, params::x, - params::beta, params::y); + member, params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; struct SerialGemvFactory { - template - using functor_type = - SerialGEMVOp; + template + using functor_type = SerialGEMVOp; - using algorithms = std::tuple; + using algorithms = std::tuple; }; #ifdef __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ struct SerialMKLGemvFactory { - template - using functor_type = - SerialGEMVOp; + template + using functor_type = SerialGEMVOp; using algorithms = std::tuple; }; @@ -61,10 +53,8 @@ struct SerialMKLGemvFactory { } // namespace Test -#define TEST_SERIAL_CASE4(N, A, X, Y, SC) \ - TEST_CASE4(serial, SerialGemvFactory, N, A, X, Y, SC) -#define TEST_SERIAL_CASE2(N, S, SC) \ - TEST_CASE2(serial, SerialGemvFactory, N, S, SC) +#define TEST_SERIAL_CASE4(N, A, X, Y, SC) TEST_CASE4(serial, SerialGemvFactory, N, A, X, Y, SC) +#define TEST_SERIAL_CASE2(N, S, SC) TEST_CASE2(serial, SerialGemvFactory, N, S, SC) #define TEST_SERIAL_CASE(N, S) TEST_CASE(serial, SerialGemvFactory, N, S) #ifdef KOKKOSKERNELS_TEST_FLOAT @@ -76,8 +66,7 @@ using simd_float_avx = ::Test::simd_vector; using simd_float_avx512 = ::Test::simd_vector; TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_sse, simd_float_sse, float) TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx, simd_float_avx, float) -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx512, simd_float_avx512, - float) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx512, simd_float_avx512, float) #endif #endif @@ -88,12 +77,9 @@ TEST_SERIAL_CASE(double, double) using simd_double_sse = ::Test::simd_vector; using simd_double_avx = ::Test::simd_vector; using simd_double_avx512 = ::Test::simd_vector; -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_sse, simd_double_sse, - double) -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx, simd_double_avx, - double) -TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx512, simd_double_avx512, - double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_sse, simd_double_sse, double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx, simd_double_avx, double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx512, simd_double_avx512, double) #endif #endif diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 5658ca5ea1..8dc7cadf51 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -51,110 +51,85 @@ namespace Test { -template +template class SyrTester { public: SyrTester(); ~SyrTester(); - void test(const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults = false, - const bool useHermitianOption = false, - const bool useUpOption = false); + void test(const int N, const int nonConstConstCombinations, const bool useAnalyticalResults = false, + const bool useHermitianOption = false, const bool useUpOption = false); private: using _ViewTypeX = Kokkos::View; using _ViewTypeA = Kokkos::View; - using _HostViewTypeX = typename _ViewTypeX::HostMirror; - using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = - Kokkos::View; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeA& h_A, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeA& h_A, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyrAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation); + void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); template - void callKkGerAndCompareKkSyrAgainstIt( - const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr, const std::string& situation); + void callKkGerAndCompareKkSyrAgainstIt(const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& org_A, + const _HostViewTypeA& h_A_syr, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -172,15 +147,13 @@ class SyrTester { bool _kkGerShouldThrowException; }; -template +template SyrTester::SyrTester() : _A_is_complex(std::is_same>::value || std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< - typename Device::execution_space>()) + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) @@ -197,12 +170,8 @@ SyrTester::SyrTester() // large enough to require 'relTol' to value 5.0e-3. The same // calculations show no discrepancies for calculations with double. // **************************************************************** - _absTol(std::is_same<_AuxType, float>::value - ? 1.0e-6 - : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value - ? 5.0e-3 - : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -212,33 +181,27 @@ SyrTester::SyrTester() _kkGerShouldThrowException(false) { } -template +template SyrTester::~SyrTester() { // Nothing to do } -template -void SyrTester::test( - const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults, const bool useHermitianOption, - const bool useUpOption) { +template +void SyrTester::test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults, + const bool useHermitianOption, + const bool useUpOption) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Entering SyrTester::test()... - - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - - - " << std::endl; - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + std::cout << "_A_is_complex = " << _A_is_complex << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " - << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol - << ", _relTol = " << _relTol - << ", nonConstConstCombinations = " << nonConstConstCombinations - << ", useAnalyticalResults = " << useAnalyticalResults - << ", useHermitianOption = " << useHermitianOption + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults << ", useHermitianOption = " << useHermitianOption << ", useUpOption = " << useUpOption << std::endl; #endif // ******************************************************************** @@ -273,8 +236,7 @@ void SyrTester::test( view_stride_adapter<_ViewTypeX, false> x("X", _M); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected( - "expected A += alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(_KAT_A::zero()); @@ -282,18 +244,14 @@ void SyrTester::test( // ******************************************************************** // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A // ******************************************************************** - this->populateVariables(alpha, x, A, h_expected.d_view, - expectedResultIsKnown); + this->populateVariables(alpha, x, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 7: populate h_vanilla // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla( - "vanilla = A + alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG - Kokkos::printf( - "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", - typeid(alpha).name()); + Kokkos::printf("In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); #endif this->populateVanillaValues(alpha, x.h_view, A.h_view, h_vanilla.d_view); @@ -304,8 +262,7 @@ void SyrTester::test( // ****************************************************************** // Compare h_vanilla against h_expected // ****************************************************************** - this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, - h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -321,13 +278,11 @@ void SyrTester::test( Kokkos::deep_copy(org_A.h_view, A.h_view); if (test_x) { - this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view, A, - h_expected.d_view, "non const x"); + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view, A, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { - this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, A.h_view, - "non const x"); + this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, A.h_view, "non const x"); } } @@ -337,8 +292,7 @@ void SyrTester::test( if (test_cx) { Kokkos::deep_copy(A.d_base, org_A.d_base); - this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A, - h_expected.d_view, "const x"); + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A, h_expected.d_view, "const x"); } // ******************************************************************** @@ -361,12 +315,10 @@ void SyrTester::test( #endif } -template +template void SyrTester::populateVariables( - ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { @@ -416,8 +368,7 @@ void SyrTester::populateVariables( } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarX randStart, randEnd; @@ -464,8 +415,7 @@ void SyrTester::populateVariables( if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view(i, j) - << std::endl; + std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view(i, j) << std::endl; } } } @@ -473,16 +423,13 @@ void SyrTester::populateVariables( } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { if (_useHermitianOption) { alpha.real() = 1.; alpha.imag() = 0.; @@ -500,10 +447,8 @@ SyrTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_A(i, j).real() = cos(auxImJ); h_A(i, j).imag() = -sin(auxImJ); } else { @@ -515,8 +460,7 @@ SyrTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_A(i, j).real() = sin(auxIpJ) + cos(auxIpJ); h_A(i, j).imag() = sin(auxIpJ) - cos(auxIpJ); } @@ -526,10 +470,8 @@ SyrTester= j))) { - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j).real() = 2. * cos(auxImJ); h_expected(i, j).imag() = -2. * sin(auxImJ); } else { @@ -541,10 +483,8 @@ SyrTester= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j).real() = 2. * sin(auxIpJ); h_expected(i, j).imag() = 2. * sin(auxIpJ); } else { @@ -557,16 +497,13 @@ SyrTester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { alpha = 2; for (int i = 0; i < _M; ++i) { @@ -577,18 +514,15 @@ SyrTestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - _AuxType auxJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); - h_A(i, j) = 2 * cos(auxI) * cos(auxJ); + _AuxType auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_A(i, j) = 2 * cos(auxI) * cos(auxJ); } } for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j) = 2 * cos(auxImJ); } else { h_expected(i, j) = h_A(i, j); @@ -598,21 +532,19 @@ SyrTester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester::populateVanillaValues( - const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * _KAT_A::conj(h_x(j)) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); @@ -625,8 +557,7 @@ SyrTester::populateVanillaValues( } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); @@ -638,8 +569,7 @@ SyrTester::populateVanillaValues( if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_x(j)); } else { h_vanilla(i, j) = h_A(i, j); @@ -652,8 +582,7 @@ SyrTester::populateVanillaValues( } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); @@ -665,20 +594,18 @@ SyrTester::populateVanillaValues( } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester::populateVanillaValues( - const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); @@ -688,8 +615,7 @@ SyrTester::populateVanillaValues( } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); @@ -699,11 +625,9 @@ SyrTester::populateVanillaValues( } } -template +template template -T SyrTester::shrinkAngleToZeroTwoPiRange(const T input) { +T SyrTester::shrinkAngleToZeroTwoPiRange(const T input) { T output(input); #if 0 T twoPi( 2. * Kokkos::numbers::pi ); @@ -718,29 +642,23 @@ T SyrTester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) << ", h_van(" << i << "," << j + << ")=" << h_vanilla(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsRealAbs(0); @@ -759,7 +677,7 @@ SyrTester:: for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; if (h_expected(i, j).real() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -783,17 +701,15 @@ SyrTester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << ", _KAT_A::abs(h_expected(i,j).real() - " "h_vanilla(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } - diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; if (h_expected(i, j).imag() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -817,13 +733,11 @@ SyrTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << ", _KAT_A::abs(h_expected(i,j).imag() - " "h_vanilla(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -831,25 +745,15 @@ SyrTester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_vanilla(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -858,30 +762,19 @@ SyrTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_vanilla(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -890,8 +783,7 @@ SyrTester:: std::cout << "WARNING" << msg.str() << std::endl; #endif } - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } else { int numErrorsReal(0); @@ -902,11 +794,8 @@ SyrTester:: if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " - << h_expected(i, j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; #endif } numErrorsReal++; @@ -915,63 +804,50 @@ SyrTester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " - << h_expected(i, j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; #endif } numErrorsImag++; } } // for j } // for i - EXPECT_EQ(numErrorsReal, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on real components" - << ", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) - << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) << ", h_van(" << i << "," << j + << ")=" << h_vanilla(i, j) << std::endl; #endif } } } - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsAbs(0); @@ -1009,12 +885,10 @@ SyrTester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -1022,24 +896,14 @@ SyrTester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_vanilla(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_vanilla(i,j) = " << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1058,8 +922,7 @@ SyrTester:: if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; #endif } @@ -1067,42 +930,34 @@ SyrTester:: } } // for j } // for i - EXPECT_EQ(numErrors, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -SyrTester:: - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +SyrTester::compareKkSyrAgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) - << ", h_A(" << i << "," << j << ")=" << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) << ", h_A(" << i << "," << j + << ")=" << h_A(i, j) << std::endl; #endif } } } - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsRealAbs(0); int numErrorsRealRel(0); @@ -1143,12 +998,10 @@ SyrTester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).real() = " << h_reference(i, j).real() - << ", h_A(i,j).real() = " << h_A(i, j).real() - << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } @@ -1176,95 +1029,58 @@ SyrTester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() - << ", h_A(i,j).imag() = " << h_A(i, j).imag() - << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() << ", " << h_reference(11, 2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " - << h_A(11, 2119).imag() << ")" << std::endl; + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() << ", " << h_reference(710, 1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " - << h_A(710, 1065).imag() << ")" << std::endl; + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " << h_A(710, 1065).imag() << ")" << std::endl; } #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -1277,25 +1093,15 @@ SyrTester:: } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -1309,28 +1115,23 @@ SyrTester:: } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -SyrTester:: - compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +SyrTester::compareKkSyrAgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) - << ", h_A(" << i << "," << j << ")=" << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) << ", h_A(" << i << "," << j + << ")=" << h_A(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsAbs(0); int numErrorsRel(0); @@ -1366,53 +1167,34 @@ SyrTester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j) = " << h_reference(i, j) - << ", h_A(i,j) = " << h_A(i, j) - << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption << ": syr result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1425,16 +1207,13 @@ SyrTester:: } } -template +template template -void SyrTester:: - callKkSyrAndCompareAgainstExpected( - const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation) { +void SyrTester::callKkSyrAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, const _ViewTypeExpected& h_expected, + const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha - << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; Kokkos::printf( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", @@ -1448,25 +1227,21 @@ void SyrTester:: KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A.d_view); } catch (const std::exception& e) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "': caught exception, e.what() = " << e.what() << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "': caught exception, e.what() = " << e.what() << std::endl; #endif gotStdException = true; } catch (...) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "': caught unknown exception" << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "': caught unknown exception" << std::endl; #endif gotUnknownException = true; } - EXPECT_EQ(gotUnknownException, false) - << "Failed test, '" << situation - << "': unknown exception should not have happened"; + EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation + << "': unknown exception should not have happened"; EXPECT_EQ(gotStdException, _kkSyrShouldThrowException) - << "Failed test, '" << situation << "': kk syr() should" - << (_kkSyrShouldThrowException ? " " : " not ") + << "Failed test, '" << situation << "': kk syr() should" << (_kkSyrShouldThrowException ? " " : " not ") << "have thrown a std::exception"; if ((gotStdException == false) && (gotUnknownException == false)) { @@ -1475,14 +1250,11 @@ void SyrTester:: } } -template +template template -void SyrTester:: - callKkGerAndCompareKkSyrAgainstIt( - const ScalarA& alpha, TX& x, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr, const std::string& situation) { +void SyrTester::callKkGerAndCompareKkSyrAgainstIt( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& org_A, const _HostViewTypeA& h_A_syr, + const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, org_A.d_base); @@ -1490,8 +1262,7 @@ void SyrTester:: // Call ger() // ******************************************************************** #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha - << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha << std::endl; Kokkos::printf( "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", @@ -1504,39 +1275,33 @@ void SyrTester:: KokkosBlas::ger(mode.c_str(), alpha, x, x, A_ger.d_view); } catch (const std::exception& e) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "', ger() call: caught exception, e.what() = " << e.what() + std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught exception, e.what() = " << e.what() << std::endl; #endif gotStdException = true; } catch (...) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr, '" << situation - << "', ger() call: caught unknown exception" << std::endl; + std::cout << "In Test_Blas2_syr, '" << situation << "', ger() call: caught unknown exception" << std::endl; #endif gotUnknownException = true; } - EXPECT_EQ(gotUnknownException, false) - << "Failed test, '" << situation - << "': unknown exception should not have happened for ger() call"; + EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call"; - EXPECT_EQ(gotStdException, false) - << "Failed test, '" << situation - << "': kk ger() should not have thrown a std::exception"; + EXPECT_EQ(gotStdException, false) << "Failed test, '" << situation + << "': kk ger() should not have thrown a std::exception"; // ******************************************************************** // Prepare h_ger_reference to be compared against h_A_syr // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_ger_reference( - "h_ger_reference", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference("h_ger_reference", _M, _N); Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); std::string uplo = _useUpOption ? "U" : "L"; for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { // Keep h_ger_reference as already computed } else { h_ger_reference.d_view(i, j) = org_A.h_view(i, j); @@ -1545,9 +1310,7 @@ void SyrTester:: } if (_useHermitianOption && _A_is_complex) { for (int i(0); i < _N; ++i) { - h_ger_reference.d_view(i, i) = - 0.5 * (h_ger_reference.d_view(i, i) + - _KAT_A::conj(h_ger_reference.d_view(i, i))); + h_ger_reference.d_view(i, i) = 0.5 * (h_ger_reference.d_view(i, i) + _KAT_A::conj(h_ger_reference.d_view(i, i))); } } @@ -1569,19 +1332,16 @@ int test_syr(const std::string& caseName) { #else int test_syr(const std::string& /*caseName*/) { #endif - bool xBool = std::is_same::value || - std::is_same::value || + bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool aBool = std::is_same::value || - std::is_same::value || + bool aBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; bool useAnalyticalResults = xBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1589,9 +1349,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1625,8 +1383,7 @@ int test_syr(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1634,9 +1391,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1670,8 +1425,7 @@ int test_syr(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1679,9 +1433,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(0, 0); tester.test(1, 0); tester.test(2, 0); @@ -1714,8 +1466,7 @@ int test_syr(const std::string& /*caseName*/) { #endif #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::printf( "+-----------------------------------------------------------------------" @@ -1723,9 +1474,7 @@ int test_syr(const std::string& /*caseName*/) { Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); #endif if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(1, 0); tester.test(2, 0); tester.test(1024, 0); @@ -1742,9 +1491,7 @@ int test_syr(const std::string& /*caseName*/) { } if (true) { - Test::SyrTester - tester; + Test::SyrTester tester; tester.test(1024, 0); } @@ -1766,8 +1513,7 @@ int test_syr(const std::string& /*caseName*/) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_float"); test_syr("test case syr_float"); @@ -1776,19 +1522,16 @@ TEST_F(TestCategory, syr_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_float"); - test_syr, Kokkos::complex, TestDevice>( - "test case syr_complex_float"); + test_syr, Kokkos::complex, TestDevice>("test case syr_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double"); test_syr("test case syr_double"); @@ -1797,19 +1540,16 @@ TEST_F(TestCategory, syr_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_double"); - test_syr, Kokkos::complex, TestDevice>( - "test case syr_complex_double"); + test_syr, Kokkos::complex, TestDevice>("test case syr_complex_double"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int"); test_syr("test case syr_int"); @@ -1817,8 +1557,7 @@ TEST_F(TestCategory, syr_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int_float"); test_syr("test case syr_int_float"); diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index c49eba765b..2d6792f8c8 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -56,114 +56,91 @@ namespace Test { -template +template class Syr2Tester { public: Syr2Tester(); ~Syr2Tester(); - void test(const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults = false, - const bool useHermitianOption = false, - const bool useUpOption = false); + void test(const int N, const int nonConstConstCombinations, const bool useAnalyticalResults = false, + const bool useHermitianOption = false, const bool useUpOption = false); private: using _ViewTypeX = Kokkos::View; using _ViewTypeY = Kokkos::View; using _ViewTypeA = Kokkos::View; - using _HostViewTypeX = typename _ViewTypeX::HostMirror; - using _HostViewTypeY = typename _ViewTypeY::HostMirror; - using _HostViewTypeA = typename _ViewTypeA::HostMirror; - using _ViewTypeExpected = - Kokkos::View; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = Kokkos::View; using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown); + void populateVariables(ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, _ViewTypeExpected& h_vanilla); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareVanillaAgainstExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected); template - typename std::enable_if>::value || - std::is_same>::value, - void>::type - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template - typename std::enable_if>::value && - !std::is_same>::value, - void>::type - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference); + typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyr2AndCompareAgainstExpected( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation); + void callKkSyr2AndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); template - void callKkGerAndCompareKkSyr2AgainstIt( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr2, const std::string& situation); + void callKkGerAndCompareKkSyr2AgainstIt(const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, const _HostViewTypeA& h_A_syr2, + const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -181,16 +158,13 @@ class Syr2Tester { bool _kkGerShouldThrowException; }; -template -Syr2Tester::Syr2Tester() +template +Syr2Tester::Syr2Tester() : _A_is_complex(std::is_same>::value || std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< - typename Device::execution_space>()) + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) @@ -207,12 +181,8 @@ Syr2Tester::value - ? 1.0e-6 - : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), - _relTol(std::is_same<_AuxType, float>::value - ? 5.0e-3 - : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -222,35 +192,26 @@ Syr2Tester -Syr2Tester::~Syr2Tester() { +template +Syr2Tester::~Syr2Tester() { // Nothing to do } -template -void Syr2Tester::test(const int N, const int nonConstConstCombinations, - const bool useAnalyticalResults, - const bool useHermitianOption, - const bool useUpOption) { +template +void Syr2Tester::test( + const int N, const int nonConstConstCombinations, const bool useAnalyticalResults, const bool useHermitianOption, + const bool useUpOption) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Entering Syr2Tester::test()... - - - - - - - - - - - - - - - - " "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - - - - " << std::endl; - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + std::cout << "_A_is_complex = " << _A_is_complex << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " - << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol - << ", _relTol = " << _relTol - << ", nonConstConstCombinations = " << nonConstConstCombinations - << ", useAnalyticalResults = " << useAnalyticalResults - << ", useHermitianOption = " << useHermitianOption + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults << ", useHermitianOption = " << useHermitianOption << ", useUpOption = " << useUpOption << std::endl; #endif // ******************************************************************** @@ -286,8 +247,7 @@ void Syr2Tester y("Y", _N); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected( - "expected A += alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * x^{t,h}", _M, _N); bool expectedResultIsKnown = false; using AlphaCoeffType = typename _ViewTypeA::non_const_value_type; @@ -296,20 +256,16 @@ void Syr2TesterpopulateVariables(alpha, x, y, A, h_expected.d_view, - expectedResultIsKnown); + this->populateVariables(alpha, x, y, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 7: populate h_vanilla // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla( - "vanilla = A + alpha * x * x^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * x^{t,h}", _M, _N); #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = " - << typeid(alpha).name() << std::endl; + std::cout << "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = " << typeid(alpha).name() << std::endl; #endif - this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, - h_vanilla.d_view); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); // ******************************************************************** // Step 4 of 7: use h_vanilla and h_expected as appropriate @@ -318,8 +274,7 @@ void Syr2TestercompareVanillaAgainstExpected(alpha, h_vanilla.d_view, - h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -335,13 +290,11 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, - h_expected.d_view, "non const x"); + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { - this->callKkGerAndCompareKkSyr2AgainstIt(alpha, x.d_view, y.d_view, org_A, - A.h_view, "non const x"); + this->callKkGerAndCompareKkSyr2AgainstIt(alpha, x.d_view, y.d_view, org_A, A.h_view, "non const x"); } } @@ -351,24 +304,19 @@ void Syr2TestercallKkSyr2AndCompareAgainstExpected( - alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, "const x"); + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, "const x"); } // ******************************************************************** // Step 7 of 7: tests with invalid values on the first input parameter // ******************************************************************** - EXPECT_ANY_THROW( - KokkosBlas::syr2(".", "U", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2(".", "U", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for mode '.'"; - EXPECT_ANY_THROW( - KokkosBlas::syr2("", "U", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2("", "U", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for mode ''"; - EXPECT_ANY_THROW( - KokkosBlas::syr2("T", ".", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2("T", ".", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for uplo '.'"; - EXPECT_ANY_THROW( - KokkosBlas::syr2("T", "", alpha, x.d_view, y.d_view, A.d_view)) + EXPECT_ANY_THROW(KokkosBlas::syr2("T", "", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk syr2 should have thrown an exception for uplo ''"; #ifdef HAVE_KOKKOSKERNELS_DEBUG @@ -379,21 +327,14 @@ void Syr2Tester -void Syr2Tester< - ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, - Device>::populateVariables(ScalarA& alpha, - view_stride_adapter<_ViewTypeX, false>& x, - view_stride_adapter<_ViewTypeY, false>& y, - view_stride_adapter<_ViewTypeA, false>& A, - _ViewTypeExpected& h_expected, - bool& expectedResultIsKnown) { +template +void Syr2Tester::populateVariables( + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, - h_expected); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, h_expected); Kokkos::deep_copy(x.d_base, x.h_base); Kokkos::deep_copy(y.d_base, y.h_base); Kokkos::deep_copy(A.d_base, A.h_base); @@ -447,8 +388,7 @@ void Syr2Tester< } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); { ScalarX randStart, randEnd; @@ -502,8 +442,7 @@ void Syr2Tester< if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ") = " << A.h_view(i, j) - << std::endl; + std::cout << "h_origA(" << i << "," << j << ") = " << A.h_view(i, j) << std::endl; } } } @@ -511,17 +450,12 @@ void Syr2Tester< } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::populateAnalyticalValues( + T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { alpha.real() = 1.4; alpha.imag() = -2.3; @@ -540,12 +474,9 @@ Syr2TestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_A(i, j).real() = sin(auxIpJ); h_A(i, j).imag() = -sin(auxImJ); } else { @@ -557,8 +488,7 @@ Syr2TestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_A(i, j).real() = sin(auxIpJ); h_A(i, j).imag() = sin(auxIpJ); } @@ -568,12 +498,9 @@ Syr2Tester= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - _AuxType auxImJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); h_expected(i, j).real() = 3.8 * sin(auxIpJ); h_expected(i, j).imag() = -5.6 * sin(auxImJ); } else { @@ -585,10 +512,8 @@ Syr2Tester= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j).real() = 5.6 * sin(auxIpJ); h_expected(i, j).imag() = 3.8 * sin(auxIpJ); } else { @@ -601,17 +526,12 @@ Syr2Tester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::populateAnalyticalValues( + T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, _ViewTypeExpected& h_expected) { alpha = std::is_same<_AuxType, int>::value ? 1 : 1.1; for (int i = 0; i < _M; ++i) { @@ -626,18 +546,15 @@ Syr2TestershrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); - h_A(i, j) = .1 * sin(auxIpJ); + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j) = .1 * sin(auxIpJ); } } for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { - _AuxType auxIpJ = - this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); h_expected(i, j) = 1.2 * sin(auxIpJ); } else { h_expected(i, j) = h_A(i, j); @@ -647,27 +564,20 @@ Syr2Tester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = - h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i) + - _KAT_A::conj(alpha) * _KAT_A::conj(h_x(j)) * h_y(i); + h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i) + _KAT_A::conj(alpha) * _KAT_A::conj(h_x(j)) * h_y(i); } else { h_vanilla(i, j) = h_A(i, j); } @@ -679,10 +589,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); } @@ -693,11 +601,9 @@ Syr2Tester= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + - _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); } else { h_vanilla(i, j) = h_A(i, j); } @@ -709,10 +615,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); } @@ -723,27 +627,20 @@ Syr2Tester +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester::populateVanillaValues(const T& alpha, - const _HostViewTypeX& h_x, - const _HostViewTypeY& h_y, - const _HostViewTypeA& h_A, - _ViewTypeExpected& h_vanilla) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_useHermitianOption) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(j) * _KAT_A::conj(h_y(i)) + - _KAT_A::conj(alpha) * h_y(j) * _KAT_A::conj(h_x(i)); + h_A(i, j) + alpha * h_x(j) * _KAT_A::conj(h_y(i)) + _KAT_A::conj(alpha) * h_y(j) * _KAT_A::conj(h_x(i)); } else { h_vanilla(i, j) = h_A(i, j); } @@ -752,11 +649,9 @@ Syr2Tester= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + - _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); } else { h_vanilla(i, j) = h_A(i, j); } @@ -767,10 +662,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); } else { h_vanilla(i, j) = h_A(i, j); } @@ -779,10 +672,8 @@ Syr2Tester= j))) { - h_vanilla(i, j) = - h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); } else { h_vanilla(i, j) = h_A(i, j); } @@ -792,11 +683,10 @@ Syr2Tester +template template -T Syr2Tester::shrinkAngleToZeroTwoPiRange(const T input) { +T Syr2Tester::shrinkAngleToZeroTwoPiRange( + const T input) { T output(input); #if 0 T twoPi( 2. * Kokkos::numbers::pi ); @@ -811,29 +701,23 @@ T Syr2Tester +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) - << ", h_van(" << i << "," << j << ") = " << h_vanilla(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) << ", h_van(" << i << "," << j + << ") = " << h_vanilla(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsRealAbs(0); @@ -852,7 +736,7 @@ Syr2Tester:: for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; if (h_expected(i, j).real() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -876,16 +760,14 @@ Syr2Tester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i, j).real() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << ", _KAT_A::abs(h_expected(i,j).real() - " "h_vanilla(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } - diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; if (h_expected(i, j).imag() == 0.) { diffThreshold = _KAT_A::abs(_absTol); @@ -909,13 +791,11 @@ Syr2Tester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << ", _KAT_A::abs(h_expected(i,j).imag() - " "h_vanilla(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -923,25 +803,15 @@ Syr2Tester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_vanilla(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -950,30 +820,19 @@ Syr2Tester:: std::cout << "WARNING" << msg.str() << std::endl; } #endif - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_vanilla(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -982,8 +841,7 @@ Syr2Tester:: std::cout << "WARNING" << msg.str() << std::endl; } #endif - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) - << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } else { int numErrorsReal(0); @@ -994,11 +852,8 @@ Syr2Tester:: if (h_expected(i, j).real() != h_vanilla(i, j).real()) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (numErrorsReal == 0) { - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).real() = " - << h_expected(i, j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; } #endif numErrorsReal++; @@ -1007,62 +862,49 @@ Syr2Tester:: if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (numErrorsImag == 0) { - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j).imag() = " - << h_expected(i, j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; } #endif numErrorsImag++; } } // for j } // for i - EXPECT_EQ(numErrorsReal, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on real components" - << ", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester:: - compareVanillaAgainstExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::compareVanillaAgainstExpected( + const T& alpha, const _ViewTypeExpected& h_vanilla, const _ViewTypeExpected& h_expected) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) - << ", h_van(" << i << "," << j << ") = " << h_vanilla(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) << ", h_van(" << i << "," << j + << ") = " << h_vanilla(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); if (_useAnalyticalResults) { int numErrorsAbs(0); @@ -1100,12 +942,10 @@ Syr2Tester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " - << diff << ", diffThreshold = " << diffThreshold - << std::endl; + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j @@ -1113,24 +953,14 @@ Syr2Tester:: { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_expected(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_vanilla(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_vanilla(i,j) = " << (((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1149,8 +979,7 @@ Syr2Tester:: if (h_expected(i, j) != h_vanilla(i, j)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (numErrors == 0) { - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i, j) + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; } #endif @@ -1158,41 +987,33 @@ Syr2Tester:: } } // for j } // for i - EXPECT_EQ(numErrors, 0) - << "Failed test" - << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } // Code for complex values -template +template template -typename std::enable_if>::value || - std::is_same>::value, - void>::type -Syr2Tester:: - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference) { +typename std::enable_if< + std::is_same>::value || std::is_same>::value, void>::type +Syr2Tester::compareKkSyr2AgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) - << ", h_A(" << i << "," << j << ") = " << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) << ", h_A(" << i << "," << j + << ") = " << h_A(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsRealAbs(0); int numErrorsRealRel(0); @@ -1233,12 +1054,10 @@ Syr2Tester:: } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).real() = " << h_reference(i, j).real() - << ", h_A(i,j).real() = " << h_A(i, j).real() - << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); @@ -1265,95 +1084,58 @@ Syr2Tester:: } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() - << ", h_A(i,j).imag() = " << h_A(i, j).imag() - << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " - << diff << ", diffThreshold = " << diffThreshold << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout - << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() << ", " << h_reference(11, 2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " - << h_A(11, 2119).imag() << ")" << std::endl; + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() << ", " << h_reference(710, 1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " - << h_A(710, 1065).imag() << ")" << std::endl; + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " << h_A(710, 1065).imag() << ")" << std::endl; } #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr2 result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_reference(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", h_A(i,j).real() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); @@ -1366,25 +1148,15 @@ Syr2Tester:: } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ": syr2 result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_reference(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", h_A(i,j).imag() = " - << (((_M > 0) && (_N > 0)) - ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); @@ -1398,28 +1170,23 @@ Syr2Tester:: } // Code for non-complex values -template +template template -typename std::enable_if>::value && - !std::is_same>::value, - void>::type -Syr2Tester:: - compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_reference) { +typename std::enable_if< + !std::is_same>::value && !std::is_same>::value, void>::type +Syr2Tester::compareKkSyr2AgainstReference( + const T& alpha, const _HostViewTypeA& h_A, const _ViewTypeExpected& h_reference) { #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) - << ", h_A(" << i << "," << j << ") = " << h_A(i, j) - << std::endl; + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) << ", h_A(" << i << "," << j + << ") = " << h_A(i, j) << std::endl; } } } #endif - int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * - 1.e-3); + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); int numErrorsAbs(0); int numErrorsRel(0); @@ -1455,53 +1222,34 @@ Syr2Tester:: } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "ERROR, i = " << i << ", j = " << j - << ": h_reference(i,j) = " << h_reference(i, j) - << ", h_A(i,j) = " << h_A(i, j) - << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + std::cout << "ERROR, i = " << i << ", j = " << j << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; #endif } } // for j } // for i #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", _useUpOption = " << _useUpOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; #endif { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", numErrorsAbs = " << numErrorsAbs << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel << ", iForMaxErrorRel = " << iForMaxErrorRel << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " - << (((_M > 0) && (_N > 0)) - ? h_reference(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) - << ", h_A(i,j) = " - << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) - : 9.999e+99) + << (((_M > 0) && (_N > 0)) ? h_reference(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) + << ", h_A(i,j) = " << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); @@ -1514,22 +1262,16 @@ Syr2Tester:: } } -template +template template -void Syr2Tester:: - callKkSyr2AndCompareAgainstExpected( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& A, - const _ViewTypeExpected& h_expected, const std::string& situation) { +void Syr2Tester::callKkSyr2AndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& A, const _ViewTypeExpected& h_expected, + const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha - << std::endl; + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha << std::endl; std::cout << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::syr2()" << ": ViewTypeA = " << typeid(_ViewTypeA).name() - << ", _kkSyr2ShouldThrowException = " << _kkSyr2ShouldThrowException - << std::endl; + << ", _kkSyr2ShouldThrowException = " << _kkSyr2ShouldThrowException << std::endl; #endif std::string mode = _useHermitianOption ? "H" : "T"; std::string uplo = _useUpOption ? "U" : "L"; @@ -1540,25 +1282,21 @@ void Syr2Tester +template template -void Syr2Tester:: - callKkGerAndCompareKkSyr2AgainstIt( - const ScalarA& alpha, TX& x, TY& y, - view_stride_adapter<_ViewTypeA, false>& org_A, - const _HostViewTypeA& h_A_syr2, const std::string& situation) { +void Syr2Tester::callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, view_stride_adapter<_ViewTypeA, false>& org_A, const _HostViewTypeA& h_A_syr2, + const std::string& situation) { view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); Kokkos::deep_copy(A_ger.d_base, org_A.d_base); @@ -1583,12 +1317,10 @@ void Syr2Tester h_ger_reference( - "h_ger_reference", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference("h_ger_reference", _M, _N); Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); Kokkos::deep_copy(h_ger_reference.h_base, h_ger_reference.d_base); std::string uplo = _useUpOption ? "U" : "L"; for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - if (((_useUpOption == true) && (i <= j)) || - ((_useUpOption == false) && (i >= j))) { + if (((_useUpOption == true) && (i <= j)) || ((_useUpOption == false) && (i >= j))) { // Keep h_ger_reference as already computed } else { h_ger_reference.h_view(i, j) = org_A.h_view(i, j); @@ -1677,9 +1398,7 @@ void Syr2Tester::value || - std::is_same::value || + bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool yBool = std::is_same::value || - std::is_same::value || + bool yBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; - bool aBool = std::is_same::value || - std::is_same::value || + bool aBool = std::is_same::value || std::is_same::value || std::is_same>::value || std::is_same>::value; bool useAnalyticalResults = xBool && yBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1725,8 +1440,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for LAYOUTLEFT ..." << std::endl; #endif if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(0, 0); tester.test(1, 0); @@ -1761,8 +1475,7 @@ int test_syr2(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1770,8 +1483,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for LAYOUTRIGHT ..." << std::endl; #endif if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(0, 0); tester.test(1, 0); @@ -1806,8 +1518,7 @@ int test_syr2(const std::string& /*caseName*/) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1815,8 +1526,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for LAYOUTSTRIDE ..." << std::endl; #endif if (true) { - Test::Syr2Tester tester; tester.test(0, 0); @@ -1851,8 +1561,7 @@ int test_syr2(const std::string& /*caseName*/) { #endif #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "+--------------------------------------------------------------" "------------" @@ -1860,8 +1569,7 @@ int test_syr2(const std::string& /*caseName*/) { std::cout << "Starting " << caseName << " for MIXED LAYOUTS ..." << std::endl; #endif if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(1, 0); tester.test(2, 0); @@ -1879,8 +1587,7 @@ int test_syr2(const std::string& /*caseName*/) { } if (true) { - Test::Syr2Tester + Test::Syr2Tester tester; tester.test(1024, 0); } @@ -1903,8 +1610,7 @@ int test_syr2(const std::string& /*caseName*/) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_float"); test_syr2("test case syr2_float"); @@ -1913,19 +1619,17 @@ TEST_F(TestCategory, syr2_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_float"); - test_syr2, Kokkos::complex, - Kokkos::complex, TestDevice>("test case syr2_complex_float"); + test_syr2, Kokkos::complex, Kokkos::complex, TestDevice>( + "test case syr2_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_double"); test_syr2("test case syr2_double"); @@ -1934,20 +1638,17 @@ TEST_F(TestCategory, syr2_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_double"); - test_syr2, Kokkos::complex, - Kokkos::complex, TestDevice>( + test_syr2, Kokkos::complex, Kokkos::complex, TestDevice>( "test case syr2_complex_double"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, syr2_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int"); test_syr2("test case syr2_int"); @@ -1955,8 +1656,7 @@ TEST_F(TestCategory, syr2_int) { } #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, syr2_int_float_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int_float_double"); test_syr2("test case syr2_mixed_types"); diff --git a/blas/unit_test/Test_Blas2_team_gemv.hpp b/blas/unit_test/Test_Blas2_team_gemv.hpp index 808532a98e..851410fdb7 100644 --- a/blas/unit_test/Test_Blas2_team_gemv.hpp +++ b/blas/unit_test/Test_Blas2_team_gemv.hpp @@ -27,37 +27,30 @@ namespace Test { -template +template struct TeamGEMVOp : public GemvOpBase { using params = GemvOpBase; - TeamGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + TeamGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const { KokkosBlas::Experimental::Gemv::invoke( - member, params::trans, params::alpha, params::A, params::x, - params::beta, params::y); + member, params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; struct TeamGemvFactory { - template - using functor_type = - TeamGEMVOp; + template + using functor_type = TeamGEMVOp; - using algorithms = std::tuple; + using algorithms = std::tuple; }; } // namespace Test -#define TEST_TEAM_CASE4(N, A, X, Y, SC) \ - TEST_CASE4(team, TeamGemvFactory, N, A, X, Y, SC) +#define TEST_TEAM_CASE4(N, A, X, Y, SC) TEST_CASE4(team, TeamGemvFactory, N, A, X, Y, SC) #define TEST_TEAM_CASE2(N, S, SC) TEST_CASE2(team, TeamGemvFactory, N, S, SC) #define TEST_TEAM_CASE(N, S) TEST_CASE(team, TeamGemvFactory, N, S) diff --git a/blas/unit_test/Test_Blas2_teamvector_gemv.hpp b/blas/unit_test/Test_Blas2_teamvector_gemv.hpp index 655a5e2f12..74cdebf062 100644 --- a/blas/unit_test/Test_Blas2_teamvector_gemv.hpp +++ b/blas/unit_test/Test_Blas2_teamvector_gemv.hpp @@ -27,30 +27,23 @@ namespace Test { -template +template struct TeamVectorGEMVOp : public GemvOpBase { using params = GemvOpBase; - TeamVectorGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) + TeamVectorGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) : params(trans_, alpha_, A_, x_, beta_, y_) {} template KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const { - KokkosBlas::Experimental::Gemv::invoke(member, params::trans, - params::alpha, params::A, - params::x, params::beta, - params::y); + KokkosBlas::Experimental::Gemv::invoke( + member, params::trans, params::alpha, params::A, params::x, params::beta, params::y); } }; struct TeamVectorGemvFactory { - template - using functor_type = - TeamVectorGEMVOp; + template + using functor_type = TeamVectorGEMVOp; // no Blocked implementation using algorithms = std::tuple; @@ -58,12 +51,9 @@ struct TeamVectorGemvFactory { } // namespace Test -#define TEST_TEAMVECTOR_CASE4(N, A, X, Y, SC) \ - TEST_CASE4(teamvector, TeamVectorGemvFactory, N, A, X, Y, SC) -#define TEST_TEAMVECTOR_CASE2(N, S, SC) \ - TEST_CASE2(teamvector, TeamVectorGemvFactory, N, S, SC) -#define TEST_TEAMVECTOR_CASE(N, S) \ - TEST_CASE(teamvector, TeamVectorGemvFactory, N, S) +#define TEST_TEAMVECTOR_CASE4(N, A, X, Y, SC) TEST_CASE4(teamvector, TeamVectorGemvFactory, N, A, X, Y, SC) +#define TEST_TEAMVECTOR_CASE2(N, S, SC) TEST_CASE2(teamvector, TeamVectorGemvFactory, N, S, SC) +#define TEST_TEAMVECTOR_CASE(N, S) TEST_CASE(teamvector, TeamVectorGemvFactory, N, S) #ifdef KOKKOSKERNELS_TEST_FLOAT TEST_TEAMVECTOR_CASE(float, float) diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index cd91bc6d95..d56886cf13 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -23,8 +23,7 @@ namespace Test { -template +template struct gemm_VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -41,12 +40,9 @@ struct gemm_VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -77,10 +73,8 @@ struct gemm_VanillaGEMM { }; template -void build_matrices(const int M, const int N, const int K, - const typename ViewTypeA::value_type alpha, ViewTypeA& A, - ViewTypeB& B, const typename ViewTypeA::value_type beta, - ViewTypeC& C, ViewTypeC& Cref) { +void build_matrices(const int M, const int N, const int K, const typename ViewTypeA::value_type alpha, ViewTypeA& A, + ViewTypeB& B, const typename ViewTypeA::value_type beta, ViewTypeC& C, ViewTypeC& Cref) { using execution_space = typename TestDevice::execution_space; using ScalarA = typename ViewTypeA::non_const_value_type; using ScalarB = typename ViewTypeB::non_const_value_type; @@ -93,28 +87,22 @@ void build_matrices(const int M, const int N, const int K, // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in // Trilinos issue #6418 - const uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); - Kokkos::fill_random(A, rand_pool, - Kokkos::rand::generator_type, - ScalarA>::max()); - Kokkos::fill_random(B, rand_pool, - Kokkos::rand::generator_type, - ScalarB>::max()); - Kokkos::fill_random(C, rand_pool, - Kokkos::rand::generator_type, - ScalarC>::max()); + Kokkos::fill_random( + A, rand_pool, + Kokkos::rand::generator_type, ScalarA>::max()); + Kokkos::fill_random( + B, rand_pool, + Kokkos::rand::generator_type, ScalarB>::max()); + Kokkos::fill_random( + C, rand_pool, + Kokkos::rand::generator_type, ScalarC>::max()); Kokkos::deep_copy(Cref, C); Kokkos::fence(); - struct Test::gemm_VanillaGEMM - vgemm; + struct Test::gemm_VanillaGEMM vgemm; vgemm.A_t = false; vgemm.B_t = false; vgemm.A_c = false; @@ -127,12 +115,10 @@ void build_matrices(const int M, const int N, const int K, vgemm.alpha = alpha; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::gemm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); } @@ -146,9 +132,7 @@ struct DiffGEMM { typedef typename APT::mag_type mag_type; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team, - mag_type& diff) const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team, mag_type& diff) const { const int i = team.league_rank(); mag_type diff_row = 0; Kokkos::parallel_reduce( @@ -166,8 +150,7 @@ struct DiffGEMM { }; template -void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, - typename ViewTypeA::value_type alpha, +void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, typename ViewTypeA::value_type alpha, typename ViewTypeC::value_type beta) { bool A_t = (TA[0] != 'N') && (TA[0] != 'n'); bool B_t = (TB[0] != 'N') && (TB[0] != 'n'); @@ -187,30 +170,25 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, ViewTypeC C("C", M, N); ViewTypeC C2("C", M, N); - const uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in // Trilinos issue #6418 - Kokkos::fill_random(A, rand_pool, - Kokkos::rand::generator_type, - ScalarA>::max()); - Kokkos::fill_random(B, rand_pool, - Kokkos::rand::generator_type, - ScalarB>::max()); - Kokkos::fill_random(C, rand_pool, - Kokkos::rand::generator_type, - ScalarC>::max()); + Kokkos::fill_random( + A, rand_pool, + Kokkos::rand::generator_type, ScalarA>::max()); + Kokkos::fill_random( + B, rand_pool, + Kokkos::rand::generator_type, ScalarB>::max()); + Kokkos::fill_random( + C, rand_pool, + Kokkos::rand::generator_type, ScalarC>::max()); Kokkos::deep_copy(C2, C); Kokkos::fence(); - struct gemm_VanillaGEMM - vgemm; + struct gemm_VanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; @@ -223,12 +201,10 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, vgemm.alpha = alpha; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::gemm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); KokkosBlas::gemm(TA, TB, alpha, A, B, beta, C); @@ -238,9 +214,8 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, diffgemm.C = C; diffgemm.C2 = C2; - Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM", - Kokkos::TeamPolicy(M, Kokkos::AUTO), - diffgemm, diff_C); + Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM", Kokkos::TeamPolicy(M, Kokkos::AUTO), diffgemm, + diff_C); if (N != 0 && M != 0) { int K_eff = (K == 0) ? 1 : K; @@ -258,8 +233,7 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, } template -void impl_test_stream_gemm_psge2(const int M, const int N, const int K, - const Scalar alpha, const Scalar beta) { +void impl_test_stream_gemm_psge2(const int M, const int N, const int K, const Scalar alpha, const Scalar beta) { using execution_space = typename Device::execution_space; using ViewTypeA = Kokkos::View; using ViewTypeB = Kokkos::View; @@ -279,8 +253,7 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, Test::build_matrices(M, N, K, alpha, A1, B1, beta, C1, C1ref); Test::build_matrices(N, M, K, alpha, A2, B2, beta, C2, C2ref); - auto instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1); + auto instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); KokkosBlas::gemm(instances[0], tA, tB, alpha, A1, B1, beta, C1); KokkosBlas::gemm(instances[1], tA, tB, alpha, A2, B2, beta, C2); Kokkos::fence(); @@ -291,12 +264,10 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, diffgemm1.C = C1; diffgemm1.C2 = C1ref; - Kokkos::parallel_reduce( - "KokkosBlas::Test::DiffGEMM1", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - diffgemm1, diff_C1); + Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM1", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + diffgemm1, diff_C1); mag_type diff_C2 = 0; struct Test::DiffGEMM diffgemm2; @@ -304,12 +275,10 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, diffgemm2.C = C2; diffgemm2.C2 = C2ref; - Kokkos::parallel_reduce( - "KokkosBlas::Test::DiffGEMM2", - Kokkos::TeamPolicy( - N, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - diffgemm2, diff_C2); + Kokkos::parallel_reduce("KokkosBlas::Test::DiffGEMM2", + Kokkos::TeamPolicy( + N, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + diffgemm2, diff_C2); Kokkos::fence(); if (N != 0 && M != 0) { @@ -317,8 +286,7 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, // Expected Result: Random Walk in the least significant bit (i.e. ~ // sqrt(K)*eps eps scales with the total sum and has a factor in it for the // accuracy of the operations -> eps = K * 75 * machine_eps * 7 - const double diff_C_expected = - 1.0 * sqrt(K_eff) * K_eff * 75 * machine_eps * 7; + const double diff_C_expected = 1.0 * sqrt(K_eff) * K_eff * 75 * machine_eps * 7; const double diff_C1_average = diff_C1 / (N * M); if ((diff_C1_average >= 1.05 * diff_C_expected)) { @@ -342,55 +310,45 @@ void test_gemm() { typedef Kokkos::View view_type_b; typedef Kokkos::View view_type_c; std::vector modes = {"N", "T"}; - if (std::is_same>::value || - std::is_same>::value) + if (std::is_same>::value || std::is_same>::value) modes.push_back("C"); Scalar alpha = 4.5; std::vector betas = {0.0, 3.0}; for (Scalar beta : betas) { for (auto amode : modes) { for (auto bmode : modes) { - Test::impl_test_gemm( - amode, bmode, 0, 0, 0, alpha, beta); + Test::impl_test_gemm(amode, bmode, 0, 0, 0, alpha, beta); // BMK: N = 1 exercises the special GEMV code path in GEMM (currently, // only for modes N/N) - Test::impl_test_gemm( - amode, bmode, 50, 1, 40, alpha, beta); + Test::impl_test_gemm(amode, bmode, 50, 1, 40, alpha, beta); // LBV: K = 0 exercise the quick return code path in GEMM - Test::impl_test_gemm( - amode, bmode, 20, 14, 0, alpha, beta); - Test::impl_test_gemm( - amode, bmode, 13, 15, 17, alpha, beta); - Test::impl_test_gemm( - amode, bmode, 179, 15, 211, alpha, beta); - Test::impl_test_gemm( - amode, bmode, 12, 3071, 517, alpha, beta); + Test::impl_test_gemm(amode, bmode, 20, 14, 0, alpha, beta); + Test::impl_test_gemm(amode, bmode, 13, 15, 17, alpha, beta); + Test::impl_test_gemm(amode, bmode, 179, 15, 211, alpha, + beta); + Test::impl_test_gemm(amode, bmode, 12, 3071, 517, alpha, + beta); } } } auto pool_size = execution_space().concurrency(); if (pool_size >= 2) { - Test::impl_test_stream_gemm_psge2( - 53, 42, 17, 4.5, - 3.0); // General code path - Test::impl_test_stream_gemm_psge2( - 13, 1, 17, 4.5, 3.0); // gemv based gemm code path - Test::impl_test_stream_gemm_psge2( - 7, 13, 17, 4.5, - 3.0); // dot based gemm code path + Test::impl_test_stream_gemm_psge2(53, 42, 17, 4.5, + 3.0); // General code path + Test::impl_test_stream_gemm_psge2(13, 1, 17, 4.5, 3.0); // gemv based gemm code path + Test::impl_test_stream_gemm_psge2(7, 13, 17, 4.5, + 3.0); // dot based gemm code path } } template void test_gemm_enabled_layouts() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) test_gemm(); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) test_gemm(); #endif } @@ -416,8 +374,7 @@ void test_gemm_mixed_scalars() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_float"); test_gemm_enabled_layouts(); @@ -426,8 +383,7 @@ TEST_F(TestCategory, gemm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_double"); test_gemm_enabled_layouts(); @@ -436,8 +392,7 @@ TEST_F(TestCategory, gemm_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_double"); test_gemm_enabled_layouts>(); @@ -446,8 +401,7 @@ TEST_F(TestCategory, gemm_complex_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemm_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_float"); test_gemm_enabled_layouts>(); @@ -455,21 +409,17 @@ TEST_F(TestCategory, gemm_complex_float) { } #endif -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) && \ - !defined(KOKKOSKERNELS_ETI_ONLY) +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) && !defined(KOKKOSKERNELS_ETI_ONLY) TEST_F(TestCategory, gemm_mixed_scalars_complex_double_double) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::gemm_mixed_complex_double_double"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_double_double"); test_gemm_mixed_scalars, double>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) && \ - !defined(KOKKOSKERNELS_ETI_ONLY) +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) && !defined(KOKKOSKERNELS_ETI_ONLY) TEST_F(TestCategory, gemm_mixed_scalar_complex_float_float) { - Kokkos::Profiling::pushRegion( - "KokkosBlas::Test::gemm_mixed_complex_float_float"); + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_float_float"); test_gemm_mixed_scalars, float>(); Kokkos::Profiling::popRegion(); } diff --git a/blas/unit_test/Test_Blas3_trmm.hpp b/blas/unit_test/Test_Blas3_trmm.hpp index a186835aaa..d5ba622969 100644 --- a/blas/unit_test/Test_Blas3_trmm.hpp +++ b/blas/unit_test/Test_Blas3_trmm.hpp @@ -44,8 +44,7 @@ struct NonUnitDiagTRMM { void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct trmm_VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -62,12 +61,9 @@ struct trmm_VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -98,8 +94,8 @@ struct trmm_VanillaGEMM { }; template -void impl_test_trmm(const char* side, const char* uplo, const char* trans, - const char* diag, int M, int N, Scalar alpha) { +void impl_test_trmm(const char* side, const char* uplo, const char* trans, const char* diag, int M, int N, + Scalar alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; using APT = Kokkos::ArithTraits; @@ -112,45 +108,35 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, ViewTypeA A("A", K, K); ViewTypeB B("B", M, N); ViewTypeB B_expected("B_expected", M, N); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); - ScalarA beta = ScalarA(0); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); // printf("KokkosBlas::trmm test for alpha %g, %c %c %c %c, M %d, N %d, eps // %g, ViewType: %s\n", // Kokkos::ArithTraits::real(alpha),side[0],uplo[0],trans[0],diag[0],M,N,eps,typeid(ViewTypeA).name()); - typename ViewTypeA::HostMirror host_A = Kokkos::create_mirror_view(A); - typename ViewTypeB::HostMirror host_B_actual = Kokkos::create_mirror_view(B); - typename ViewTypeB::HostMirror host_B_expected = - Kokkos::create_mirror_view(B_expected); + typename ViewTypeA::HostMirror host_A = Kokkos::create_mirror_view(A); + typename ViewTypeB::HostMirror host_B_actual = Kokkos::create_mirror_view(B); + typename ViewTypeB::HostMirror host_B_expected = Kokkos::create_mirror_view(B_expected); Kokkos::Random_XorShift64_Pool rand_pool(seed); if ((diag[0] == 'U') || (diag[0] == 'u')) { // Initialize A with deterministic random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); using functor_type = UnitDiagTRMM; functor_type udtrmm(A); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", - Kokkos::RangePolicy(0, K), udtrmm); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRMM", Kokkos::RangePolicy(0, K), udtrmm); } else { //(diag[0]=='N')||(diag[0]=='n') // Initialize A with random numbers - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); using functor_type = NonUnitDiagTRMM; functor_type nudtrmm(A); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", - Kokkos::RangePolicy(0, K), nudtrmm); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", Kokkos::RangePolicy(0, K), nudtrmm); } - Kokkos::fill_random( - B, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, ScalarA>::max()); Kokkos::deep_copy(host_A, A); // Make host_A a lower triangle @@ -164,8 +150,7 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, } Kokkos::deep_copy(A, host_A); - struct trmm_VanillaGEMM - vgemm; + struct trmm_VanillaGEMM vgemm; if (A_l) { // B_expected = alpha * op(A) * B + beta * C = 1 * op(A) * B + 0 * C vgemm.A_t = (trans[0] != 'N') && (trans[0] != 'n'); @@ -188,12 +173,10 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, vgemm.C = B_expected; // out vgemm.alpha = alpha; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::trmm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosBlas::Test::trmm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); Kokkos::deep_copy(host_B_expected, B_expected); @@ -221,41 +204,38 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, template int test_trmm(const char* mode, ScalarA alpha) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 12, 731, alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 12, 731, + alpha); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_lr = Kokkos::View; using view_type_b_lr = Kokkos::View; - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trmm( - &mode[0], &mode[1], &mode[2], &mode[3], 12, 731, alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, + alpha); + Test::impl_test_trmm(&mode[0], &mode[1], &mode[2], &mode[3], 12, 731, + alpha); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trmm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_float"); float alpha = 1.0f; @@ -300,8 +280,7 @@ TEST_F(TestCategory, trmm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trmm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_double"); double alpha = 1.0; @@ -346,399 +325,333 @@ TEST_F(TestCategory, trmm_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("LUCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCN", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCU", 1.0); + test_trmm, Kokkos::complex, TestDevice>("RUCU", 1.0); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("LUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCN", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCU", Kokkos::complex(4.5, 0.0)); + test_trmm, Kokkos::complex, TestDevice>("RUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - test_trmm, Kokkos::complex, TestDevice>("LLNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestDevice>("LLNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestDevice>("LLCN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestDevice>("LLCU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestDevice>("LUNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestDevice>("LUNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestDevice>("LUCN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestDevice>("LUCU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestDevice>("RLNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestDevice>("RLNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestDevice>("RLCN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestDevice>("RLCU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestDevice>("RUNN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestDevice>("RUNU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNU", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestDevice>("RUCN", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUCN", 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, TestDevice>("RUCU", - 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUCU", 1.0f); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LLCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "LUCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("LUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RLCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUNU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCN", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, TestDevice>( - "RUCU", Kokkos::complex(4.5f, 0.0f)); + test_trmm, Kokkos::complex, TestDevice>("RUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas3_trsm.hpp b/blas/unit_test/Test_Blas3_trsm.hpp index 9a00f22263..81fdad8929 100644 --- a/blas/unit_test/Test_Blas3_trsm.hpp +++ b/blas/unit_test/Test_Blas3_trsm.hpp @@ -44,8 +44,7 @@ struct NonUnitDiagTRSM { void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct trsm_VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -62,12 +61,9 @@ struct trsm_VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else @@ -99,8 +95,7 @@ struct trsm_VanillaGEMM { }; template -void impl_test_trsm(const char* side, const char* uplo, const char* trans, - const char* diag, int M, int N, +void impl_test_trsm(const char* side, const char* uplo, const char* trans, const char* diag, int M, int N, typename ViewTypeA::value_type alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; @@ -123,31 +118,21 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror_view(B); typename ViewTypeB::HostMirror h_X0 = Kokkos::create_mirror_view(X0); - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); if ((diag[0] == 'U') || (diag[0] == 'u')) { - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max() * - 0.1); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max() * 0.1); using functor_type = UnitDiagTRSM; functor_type udtrsm(A); - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRSM", - Kokkos::RangePolicy(0, K), udtrsm); + Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRSM", Kokkos::RangePolicy(0, K), udtrsm); } else { //(diag[0]=='N')||(diag[0]=='n') - Kokkos::fill_random(A, rand_pool, - Kokkos::rand, - ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); using functor_type = NonUnitDiagTRSM; functor_type nudtrsm(A); - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRSM", - Kokkos::RangePolicy(0, K), nudtrsm); + Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRSM", Kokkos::RangePolicy(0, K), nudtrsm); } - Kokkos::fill_random( - X0, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(X0, rand_pool, Kokkos::rand, ScalarA>::max()); Kokkos::deep_copy(h_A, A); Kokkos::deep_copy(h_X0, X0); @@ -165,8 +150,7 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, Kokkos::deep_copy(A, h_A); - struct trsm_VanillaGEMM - vgemm; + struct trsm_VanillaGEMM vgemm; if (A_l) { vgemm.A_t = (trans[0] != 'N') && (trans[0] != 'n'); vgemm.B_t = false; @@ -187,12 +171,10 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, vgemm.C = B; vgemm.alpha = alpha_trmm; vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosBlas::Test::trsm_VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosBlas::Test::trsm_VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); KokkosBlas::trsm(side, uplo, trans, diag, alpha, A, B); @@ -223,41 +205,30 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, template int test_trsm(const char* mode, ScalarA alpha) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_lr = Kokkos::View; using view_type_b_lr = Kokkos::View; - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); - Test::impl_test_trsm( - &mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 0, 0, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 101, 19, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 19, 101, alpha); + Test::impl_test_trsm(&mode[0], &mode[1], &mode[2], &mode[3], 343, 201, alpha); #endif return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_float"); float alpha = 1.0f; @@ -302,8 +273,7 @@ TEST_F(TestCategory, trsm_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_double"); double alpha = 1.0; @@ -348,157 +318,91 @@ TEST_F(TestCategory, trsm_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_double"); Kokkos::complex alpha = 1.0; - test_trsm, Kokkos::complex, TestDevice>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestDevice>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); alpha = Kokkos::complex(4.5, 0.0); - test_trsm, Kokkos::complex, TestDevice>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestDevice>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestDevice>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trsm_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_float"); Kokkos::complex alpha = 1.0f; - test_trsm, Kokkos::complex, TestDevice>("LLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCU", - alpha); - - test_trsm, Kokkos::complex, TestDevice>("RLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCU", - alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); alpha = Kokkos::complex(4.5f, 0.0f); - test_trsm, Kokkos::complex, TestDevice>("LLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("LUCU", - alpha); - - test_trsm, Kokkos::complex, TestDevice>("RLNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RLCU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUNU", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCN", - alpha); - test_trsm, Kokkos::complex, TestDevice>("RUCU", - alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", alpha); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas_Newton.hpp b/blas/unit_test/Test_Blas_Newton.hpp index 5bb6946e99..7b6d4a9049 100644 --- a/blas/unit_test/Test_Blas_Newton.hpp +++ b/blas/unit_test/Test_Blas_Newton.hpp @@ -40,16 +40,13 @@ struct LogisticEquation { scalar_type dt; vec_type state; - LogisticEquation(const scalar_type dt_, vec_type initial_state) - : dt(dt_), state(initial_state) {} + LogisticEquation(const scalar_type dt_, vec_type initial_state) : dt(dt_), state(initial_state) {} KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const { dydt(0) = y(0) - state(0) - dt * y(0) * (1 - y(0)); } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { - jac(0, 0) = 1 - dt + 2 * dt * y(0); - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = 1 - dt + 2 * dt * y(0); } KOKKOS_FUNCTION scalar_type expected_val(const scalar_type t) const { using Kokkos::exp; @@ -112,9 +109,7 @@ int test_logistic() { using norm_type = typename Kokkos::View; using handle_type = KokkosBlas::Impl::NewtonHandle; using system_type = LogisticEquation; - using newton_type = - KokkosBlas::Impl::NewtonFunctor; + using newton_type = KokkosBlas::Impl::NewtonFunctor; // Create the non-linear system and initialize data vec_type state("state", 1); @@ -150,9 +145,7 @@ int test_intersection() { using norm_type = typename Kokkos::View; using handle_type = KokkosBlas::Impl::NewtonHandle; using system_type = Intersection; - using newton_type = - KokkosBlas::Impl::NewtonFunctor; + using newton_type = KokkosBlas::Impl::NewtonFunctor; // Create the non-linear system and initialize data system_type intersection; diff --git a/blas/unit_test/Test_Blas_rocblas.hpp b/blas/unit_test/Test_Blas_rocblas.hpp index ed68b7a8b6..091fac7259 100644 --- a/blas/unit_test/Test_Blas_rocblas.hpp +++ b/blas/unit_test/Test_Blas_rocblas.hpp @@ -58,8 +58,7 @@ void test_rocblas_safe_call() { // fails it throws an error with the // KOKKOS_ROCBLAS_SAFE_CALL_IMPL macro void test_rocblas_singleton() { - KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); (void)s; } diff --git a/blas/unit_test/Test_Blas_serial_axpy.hpp b/blas/unit_test/Test_Blas_serial_axpy.hpp index 427925a3dc..cd58eba920 100644 --- a/blas/unit_test/Test_Blas_serial_axpy.hpp +++ b/blas/unit_test/Test_Blas_serial_axpy.hpp @@ -29,8 +29,7 @@ namespace Test { struct KokkosKernelAxpyTag {}; struct NaiveAxpyTag {}; -template +template struct Functor_TestBlasSerialAxpy { using execution_space = typename DeviceType::execution_space; ScalarType _alpha; @@ -38,8 +37,7 @@ struct Functor_TestBlasSerialAxpy { ViewType _y; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialAxpy(const ScalarType alpha, const ViewType &x, - const ViewType &y) + Functor_TestBlasSerialAxpy(const ScalarType alpha, const ViewType &x, const ViewType &y) : _alpha(alpha), _x(x), _y(y) {} KOKKOS_INLINE_FUNCTION @@ -62,15 +60,11 @@ struct Functor_TestBlasSerialAxpy { using value_type = typename ViewType::value_type; std::string name_region("KokkosBlas::Test::SerialAxpy"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value - ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = "Axpy"; - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Axpy"; + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); @@ -91,20 +85,15 @@ void impl_test_blas_serial_axpy(const int N, const int BlkSize) { ViewType Y("Y", N, BlkSize, BlkSize); ViewType Yref("Yref", N, BlkSize, BlkSize); - Kokkos::Random_XorShift64_Pool random( - 13718); + Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(X, random, ats::one()); Kokkos::fill_random(Y, random, ats::one()); Kokkos::fence(); Kokkos::deep_copy(Yref, Y); /// test body - Functor_TestBlasSerialAxpy( - alpha, X, Yref) - .run(); - Functor_TestBlasSerialAxpy(alpha, X, Y) - .run(); + Functor_TestBlasSerialAxpy(alpha, X, Yref).run(); + Functor_TestBlasSerialAxpy(alpha, X, Y).run(); Kokkos::fence(); @@ -116,12 +105,10 @@ void impl_test_blas_serial_axpy(const int N, const int BlkSize) { Kokkos::deep_copy(Yref_host, Yref); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(Y_host(k, i, j), Yref_host(k, i, j), eps); + for (int j = 0; j < BlkSize; ++j) EXPECT_NEAR_KK(Y_host(k, i, j), Yref_host(k, i, j), eps); } } // namespace Test @@ -130,24 +117,20 @@ template int test_blas_serial_axpy() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_blas_serial_axpy(0, 10); Test::impl_test_blas_serial_axpy(10, 15); Test::impl_test_blas_serial_axpy(1024, 9); - Test::impl_test_blas_serial_axpy(132231, - 3); + Test::impl_test_blas_serial_axpy(132231, 3); } #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - typedef Kokkos::View - ViewType; + typedef Kokkos::View ViewType; Test::impl_test_blas_serial_axpy(0, 10); Test::impl_test_blas_serial_axpy(10, 15); Test::impl_test_blas_serial_axpy(1024, 9); - Test::impl_test_blas_serial_axpy(132231, - 3); + Test::impl_test_blas_serial_axpy(132231, 3); } #endif @@ -155,21 +138,16 @@ int test_blas_serial_axpy() { } #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, serial_axpy_float_float) { - test_blas_serial_axpy(); -} +TEST_F(TestCategory, serial_axpy_float_float) { test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, serial_axpy_double_double) { - test_blas_serial_axpy(); -} +TEST_F(TestCategory, serial_axpy_double_double) { test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, serial_axpy_dcomplex_dcomplex) { - test_blas_serial_axpy, - Kokkos::complex >(); + test_blas_serial_axpy, Kokkos::complex >(); } TEST_F(TestCategory, serial_axpy_dcomplex_double) { @@ -179,13 +157,10 @@ TEST_F(TestCategory, serial_axpy_dcomplex_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, serial_axpy_fcomplex_fcomplex) { - test_blas_serial_axpy, - Kokkos::complex >(); + test_blas_serial_axpy, Kokkos::complex >(); } -TEST_F(TestCategory, serial_axpy_fcomplex_float) { - test_blas_serial_axpy, float>(); -} +TEST_F(TestCategory, serial_axpy_fcomplex_float) { test_blas_serial_axpy, float>(); } #endif #endif // TEST_BLAS_SERIAL_AXPY_HPP_ diff --git a/blas/unit_test/Test_Blas_serial_nrm2.hpp b/blas/unit_test/Test_Blas_serial_nrm2.hpp index 147df52353..bca8afa1f3 100644 --- a/blas/unit_test/Test_Blas_serial_nrm2.hpp +++ b/blas/unit_test/Test_Blas_serial_nrm2.hpp @@ -38,8 +38,7 @@ struct Functor_TestBlasSerialNrm2 { norm_view_type _nrm; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialNrm2(const ViewType &x, const norm_view_type &nrm) - : _x(x), _nrm(nrm) {} + Functor_TestBlasSerialNrm2(const ViewType &x, const norm_view_type &nrm) : _x(x), _nrm(nrm) {} KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const int i) const { @@ -61,14 +60,11 @@ struct Functor_TestBlasSerialNrm2 { inline void run() { std::string name_region("KokkosBlas::Test::SerialNrm2"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = "Nrm2"; - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); @@ -89,8 +85,7 @@ struct Functor_TestBlasSerialNrm2MV { norm_view_type _nrm; KOKKOS_INLINE_FUNCTION - Functor_TestBlasSerialNrm2MV(const ViewType &x, const norm_view_type &nrm) - : _x(x), _nrm(nrm) {} + Functor_TestBlasSerialNrm2MV(const ViewType &x, const norm_view_type &nrm) : _x(x), _nrm(nrm) {} KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, const int i) const { @@ -116,14 +111,11 @@ struct Functor_TestBlasSerialNrm2MV { inline void run() { std::string name_region("KokkosBlas::Test::SerialNrm2MV"); const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = "Nrm2"; - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; + std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); @@ -153,31 +145,24 @@ void impl_test_blas_serial_nrm2(const int N, const int BlkSize) { /// test body Functor_TestBlasSerialNrm2(X, norms).run(); - Functor_TestBlasSerialNrm2(X, - norms_ref) - .run(); + Functor_TestBlasSerialNrm2(X, norms_ref).run(); Kokkos::fence(); /// for comparison send it to host - typename norm_view_type::HostMirror norms_host = - Kokkos::create_mirror_view(norms); - typename norm_view_type::HostMirror norms_ref_host = - Kokkos::create_mirror_view(norms_ref); + typename norm_view_type::HostMirror norms_host = Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = Kokkos::create_mirror_view(norms_ref); Kokkos::deep_copy(norms_host, norms); Kokkos::deep_copy(norms_ref_host, norms_ref); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); - for (int k = 0; k < N; ++k) - EXPECT_NEAR_KK(norms_host(k), norms_ref_host(k), eps); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) EXPECT_NEAR_KK(norms_host(k), norms_ref_host(k), eps); } template -void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, - const int numVecs) { +void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, const int numVecs) { /// typedefs using execution_space = typename DeviceType::execution_space; using value_type = typename ViewType::non_const_value_type; @@ -197,24 +182,19 @@ void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, /// test body Functor_TestBlasSerialNrm2MV(X, norms).run(); - Functor_TestBlasSerialNrm2MV(X, - norms_ref) - .run(); + Functor_TestBlasSerialNrm2MV(X, norms_ref).run(); Kokkos::fence(); /// for comparison send it to host - typename norm_view_type::HostMirror norms_host = - Kokkos::create_mirror_view(norms); - typename norm_view_type::HostMirror norms_ref_host = - Kokkos::create_mirror_view(norms_ref); + typename norm_view_type::HostMirror norms_host = Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = Kokkos::create_mirror_view(norms_ref); Kokkos::deep_copy(norms_host, norms); Kokkos::deep_copy(norms_ref_host, norms_ref); /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); + typename ats::mag_type eps = 100 * std::numeric_limits::epsilon(); for (int k = 0; k < N; ++k) for (int vecIdx = 0; vecIdx < numVecs; ++vecIdx) EXPECT_NEAR_KK(norms_host(k, vecIdx), norms_ref_host(k, vecIdx), eps); @@ -232,8 +212,7 @@ int test_blas_serial_nrm2() { Test::impl_test_blas_serial_nrm2(1024, 9); Test::impl_test_blas_serial_nrm2(132231, 3); - using MVViewType = - Kokkos::View; + using MVViewType = Kokkos::View; Test::impl_test_blas_serial_nrm2mv(0, 10, 5); Test::impl_test_blas_serial_nrm2mv(10, 15, 7); Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); @@ -242,15 +221,13 @@ int test_blas_serial_nrm2() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) { - using ViewType = - Kokkos::View; + using ViewType = Kokkos::View; Test::impl_test_blas_serial_nrm2(0, 10); Test::impl_test_blas_serial_nrm2(10, 15); Test::impl_test_blas_serial_nrm2(1024, 9); Test::impl_test_blas_serial_nrm2(132231, 3); - using MVViewType = - Kokkos::View; + using MVViewType = Kokkos::View; Test::impl_test_blas_serial_nrm2mv(0, 10, 5); Test::impl_test_blas_serial_nrm2mv(10, 15, 5); Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); @@ -262,27 +239,19 @@ int test_blas_serial_nrm2() { } #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, serial_nrm2_float_float) { - test_blas_serial_nrm2(); -} +TEST_F(TestCategory, serial_nrm2_float_float) { test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, serial_nrm2_double_double) { - test_blas_serial_nrm2(); -} +TEST_F(TestCategory, serial_nrm2_double_double) { test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) -TEST_F(TestCategory, serial_nrm2_fcomplex_float) { - test_blas_serial_nrm2 >(); -} +TEST_F(TestCategory, serial_nrm2_fcomplex_float) { test_blas_serial_nrm2 >(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) -TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { - test_blas_serial_nrm2 >(); -} +TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { test_blas_serial_nrm2 >(); } #endif #endif // TEST_BLAS_SERIAL_NRM2_HPP_ diff --git a/common/impl/KokkosKernels_Iota.hpp b/common/impl/KokkosKernels_Iota.hpp index 04851e81c9..770a0201ef 100644 --- a/common/impl/KokkosKernels_Iota.hpp +++ b/common/impl/KokkosKernels_Iota.hpp @@ -67,8 +67,7 @@ class Iota { Constructing with size < 0 yeilds a 0-size Iota */ KOKKOS_INLINE_FUNCTION - constexpr Iota(const size_type &size, const value_type offset) - : size_(size), offset_(offset) { + constexpr Iota(const size_type &size, const value_type offset) : size_(size), offset_(offset) { if constexpr (std::is_signed_v) { if (size_ < size_type(0)) { size_ = 0; @@ -102,8 +101,7 @@ class Iota { Creating a subview outside of the base Iota yeilds undefined behavior */ template - KOKKOS_INLINE_FUNCTION constexpr Iota(const Iota &base, - const Kokkos::pair &range) + KOKKOS_INLINE_FUNCTION constexpr Iota(const Iota &base, const Kokkos::pair &range) : Iota(range.second - range.first, base.offset_ + range.first) {} /*! \brief Construct Iota subview @@ -111,9 +109,7 @@ class Iota { i >= size() or i < 0 yields undefined behavior. */ KOKKOS_INLINE_FUNCTION - constexpr T operator()(size_type i) const noexcept { - return value_type(i + offset_); - }; + constexpr T operator()(size_type i) const noexcept { return value_type(i + offset_); }; /// \brief return the size of the iota KOKKOS_INLINE_FUNCTION diff --git a/common/impl/KokkosKernels_NaN.hpp b/common/impl/KokkosKernels_NaN.hpp index f319539a9f..75d6a3ac8c 100644 --- a/common/impl/KokkosKernels_NaN.hpp +++ b/common/impl/KokkosKernels_NaN.hpp @@ -26,10 +26,9 @@ namespace KokkosKernels::Impl { template KOKKOS_INLINE_FUNCTION T quiet_NaN() { if constexpr (std::is_same_v) { - return double(Kokkos::Experimental::quiet_NaN_v< - float>); // Kokkos::Experimetnal::quiet_NaN_v - // is undefined in - // device code + return double(Kokkos::Experimental::quiet_NaN_v); // Kokkos::Experimetnal::quiet_NaN_v + // is undefined in + // device code } else if constexpr (Kokkos::ArithTraits::is_complex) { using value_type = typename T::value_type; return T(quiet_NaN(), diff --git a/common/impl/KokkosKernels_SafeCompare.hpp b/common/impl/KokkosKernels_SafeCompare.hpp index 494ef45ada..1bd43c046a 100644 --- a/common/impl/KokkosKernels_SafeCompare.hpp +++ b/common/impl/KokkosKernels_SafeCompare.hpp @@ -47,8 +47,7 @@ KOKKOS_INLINE_FUNCTION constexpr bool safe_gt(const T &t, const U &u) { using KU = Kokkos::ArithTraits; // both are integer, but only one is signed - if constexpr (KT::is_integer && KU::is_integer && - (KT::is_signed != KU::is_signed)) { + if constexpr (KT::is_integer && KU::is_integer && (KT::is_signed != KU::is_signed)) { // how wide the signed type would need to be to hold T and U constexpr size_t t_width = KT::is_signed ? sizeof(T) : 2 * sizeof(T); constexpr size_t u_width = KU::is_signed ? sizeof(U) : 2 * sizeof(U); diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp index 2ae8fb609d..4769f1744a 100644 --- a/common/impl/KokkosKernels_ViewUtils.hpp +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -29,13 +29,11 @@ class with_unmanaged { using layout_type = typename View::array_layout; using memory_space = typename View::memory_space; - using orig_traits = typename View::memory_traits; - static constexpr unsigned new_traits = - orig_traits::impl_value | Kokkos::Unmanaged; + using orig_traits = typename View::memory_traits; + static constexpr unsigned new_traits = orig_traits::impl_value | Kokkos::Unmanaged; public: - using type = Kokkos::View >; + using type = Kokkos::View >; }; /*! \brief A type that is View with Kokkos::Unmanaged added to the memory traits diff --git a/common/src/KokkosKernels_BitUtils.hpp b/common/src/KokkosKernels_BitUtils.hpp index 5be56c388c..9dcf8a38ae 100644 --- a/common/src/KokkosKernels_BitUtils.hpp +++ b/common/src/KokkosKernels_BitUtils.hpp @@ -222,8 +222,7 @@ int least_set_bit( long long i ){ } */ -#elif defined(__INTEL_COMPILER) || defined(KOKKOS_COMPILER_IBM) || \ - defined(__GNUC__) || defined(__GNUG__) +#elif defined(__INTEL_COMPILER) || defined(KOKKOS_COMPILER_IBM) || defined(__GNUC__) || defined(__GNUG__) KOKKOS_FORCEINLINE_FUNCTION int least_set_bit(unsigned i) { return __builtin_ffs(i); } KOKKOS_FORCEINLINE_FUNCTION diff --git a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp index 3ca160164c..2b64c38ce9 100644 --- a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp @@ -20,14 +20,13 @@ #include "KokkosKernels_BlockUtils.hpp" #include "KokkosKernels_HashmapAccumulator.hpp" -//#define HASHMAPACCUMULATOR_ASSERT_ENABLED +// #define HASHMAPACCUMULATOR_ASSERT_ENABLED namespace KokkosKernels { namespace Experimental { -template +template /** * \brief BlockHashmapAccumulator class * The use of this is described in the paper: @@ -89,13 +88,7 @@ struct BlockHashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - BlockHashmapAccumulator() - : hash_begins(), - hash_nexts(), - keys(), - values(), - __max_value_size(), - __hashOpRHS(0) {} + BlockHashmapAccumulator() : hash_begins(), hash_nexts(), keys(), values(), __max_value_size(), __hashOpRHS(0) {} /** * \brief parameterized constructor BlockHashmapAccumulator @@ -113,10 +106,8 @@ struct BlockHashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - BlockHashmapAccumulator(size_type block_dim_, const size_type max_value_size_, - const size_type hashOpRHS, size_type *hash_begins_, - size_type *hash_nexts_, key_type *keys_, - value_type *values_) + BlockHashmapAccumulator(size_type block_dim_, const size_type max_value_size_, const size_type hashOpRHS, + size_type *hash_begins_, size_type *hash_nexts_, key_type *keys_, value_type *values_) : hash_begins(hash_begins_), hash_nexts(hash_nexts_), keys(keys_), @@ -136,10 +127,9 @@ struct BlockHashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. KOKKOS_INLINE_FUNCTION - void sequential_insert_into_hash_mergeAdd_TrackHashes( - key_type key, const value_type *valueA, const value_type *valueB, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + void sequential_insert_into_hash_mergeAdd_TrackHashes(key_type key, const value_type *valueA, + const value_type *valueB, size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return; @@ -149,8 +139,7 @@ struct BlockHashmapAccumulator { hash = __compute_hash(key, __hashOpRHS); for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { - KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, - valueA, valueB); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, valueA, valueB); return; } } @@ -164,8 +153,7 @@ struct BlockHashmapAccumulator { hash_begins[hash] = my_index; keys[my_index] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + my_index * block_size, valueA, valueB); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + my_index * block_size, valueA, valueB); } // Performs C[hash] += A * B (for existing entry) @@ -173,37 +161,28 @@ struct BlockHashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. KOKKOS_INLINE_FUNCTION - void sequential_insert_into_hash_simple(key_type key, const value_type *a_val, - const value_type *b_val, - size_type &used_size, - size_type *used_hashes) { - for (size_type hash = (key * HASHSCALAR) & __hashOpRHS;; - hash = (hash + 1) & __hashOpRHS) { + void sequential_insert_into_hash_simple(key_type key, const value_type *a_val, const value_type *b_val, + size_type &used_size, size_type *used_hashes) { + for (size_type hash = (key * HASHSCALAR) & __hashOpRHS;; hash = (hash + 1) & __hashOpRHS) { if (keys[hash] == -1) { used_hashes[used_size++] = hash; keys[hash] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + hash * block_size, a_val, b_val); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + hash * block_size, a_val, b_val); break; } else if (keys[hash] == key) { - KokkosSparse::Impl::kk_block_add_mul( - block_dim, values + hash * block_size, a_val, b_val); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + hash * block_size, a_val, b_val); break; } } } KOKKOS_INLINE_FUNCTION - void sequential_export_values_simple(const size_type used_size, - const size_type *used_hashes, - key_type *out_keys, - value_type *out_values, - const bool clear = true) { + void sequential_export_values_simple(const size_type used_size, const size_type *used_hashes, key_type *out_keys, + value_type *out_values, const bool clear = true) { for (size_type i = 0; i < used_size; ++i) { const auto hash = used_hashes[i]; out_keys[i] = keys[hash]; - KokkosSparse::Impl::kk_block_set(block_dim, out_values + i * block_size, - values + hash * block_size); + KokkosSparse::Impl::kk_block_set(block_dim, out_values + i * block_size, values + hash * block_size); if (clear) { keys[hash] = -1; } @@ -218,10 +197,9 @@ struct BlockHashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd_TrackHashes( - const key_type key, const value_type *valA, const value_type *valB, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes(const key_type key, const value_type *valA, + const value_type *valB, volatile size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -232,8 +210,7 @@ struct BlockHashmapAccumulator { for (; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { - KokkosSparse::Impl::kk_block_add_mul( - block_dim, values + i * block_size, valA, valB); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, valA, valB); return __insert_success; } } @@ -247,8 +224,7 @@ struct BlockHashmapAccumulator { return __insert_full; } else { keys[my_write_index] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + my_write_index * block_size, valA, valB); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + my_write_index * block_size, valA, valB); #ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS // this is an issue on VOLTA+ and up because warps do not go in SIMD @@ -276,11 +252,9 @@ struct BlockHashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -288,12 +262,9 @@ struct BlockHashmapAccumulator { } template - KOKKOS_INLINE_FUNCTION int - vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - const team_member_t & /* teamMember */, const int /* vector_size */, - size_type hash, const key_type key, const value_type *valA, - const value_type *valB, volatile size_type *used_size_, - const size_type max_value_size_) { + KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + const team_member_t & /* teamMember */, const int /* vector_size */, size_type hash, const key_type key, + const value_type *valA, const value_type *valB, volatile size_type *used_size_, const size_type max_value_size_) { // Cannot compute hash here due to impl_speed use-case // hash = __compute_hash(key, __hashOpRHS); if (key == -1) return __insert_success; @@ -302,8 +273,7 @@ struct BlockHashmapAccumulator { size_type i = hash_begins[hash]; for (; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { - KokkosSparse::Impl::kk_block_add_mul( - block_dim, values + i * block_size, valA, valB); + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, valA, valB); return __insert_success; } } @@ -316,15 +286,13 @@ struct BlockHashmapAccumulator { if (used_size_[0] >= max_value_size_) { return __insert_full; } - size_type my_write_index = - Kokkos::atomic_fetch_add(used_size_, size_type(1)); + size_type my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); if (my_write_index >= max_value_size_) { return __insert_full; } else { keys[my_write_index] = key; - KokkosSparse::Impl::kk_block_set_mul( - block_dim, values + my_write_index * block_size, valA, valB); + KokkosSparse::Impl::kk_block_set_mul(block_dim, values + my_write_index * block_size, valA, valB); #ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS // this is an issue on VOLTA+ and up because warps do not go in SIMD @@ -356,8 +324,7 @@ struct BlockHashmapAccumulator { // hashbeginning = hash_begins[hash] // hash_begins[hash] = my_write_index // hash_nexts[my_write_index] = hash_begins[hash] - size_type hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + size_type hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -371,15 +338,12 @@ struct BlockHashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd(const key_type key, - const value_type *valA, - const value_type *valB, + int vector_atomic_insert_into_hash_mergeAdd(const key_type key, const value_type *valA, const value_type *valB, volatile size_type *used_size_) { if (key == -1) return __insert_success; return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - nullptr, 0, __compute_hash(key, __hashOpRHS), key, valA, valB, - used_size_, __max_value_size); + nullptr, 0, __compute_hash(key, __hashOpRHS), key, valA, valB, used_size_, __max_value_size); } #if 0 @@ -592,11 +556,9 @@ struct BlockHashmapAccumulator { static constexpr int __insert_success = 0; static constexpr int __insert_full = 1; - template ::value || - std::is_same::value, - std::size_t>::type = 0> + template ::value || + std::is_same::value, + std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type bitmask) { size_type hash = key & bitmask; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED @@ -606,9 +568,8 @@ struct BlockHashmapAccumulator { return hash; } - template ::value, - std::size_t>::type = 0> + template ::value, std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type divisor) { size_type hash = key % divisor; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED diff --git a/common/src/KokkosKernels_BlockUtils.hpp b/common/src/KokkosKernels_BlockUtils.hpp index 6fd9d9b656..64309372ac 100644 --- a/common/src/KokkosKernels_BlockUtils.hpp +++ b/common/src/KokkosKernels_BlockUtils.hpp @@ -25,10 +25,9 @@ namespace Impl { // Initializes block: A = [val, val, val, ....] template -KOKKOS_INLINE_FUNCTION void kk_block_init( - const size_type block_dim, value_type *dst, - const value_type val = static_cast( - 0)) { // Note: replaces __host__ std::fill() not to be called from GPU +KOKKOS_INLINE_FUNCTION void kk_block_init(const size_type block_dim, value_type *dst, + const value_type val = static_cast( + 0)) { // Note: replaces __host__ std::fill() not to be called from GPU for (auto end = dst + (block_dim * block_dim); dst < end; ++dst) { *dst = val; } @@ -36,17 +35,13 @@ KOKKOS_INLINE_FUNCTION void kk_block_init( // Initializes block: A = B template -KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, - value_type *dst, - const value_type *val) { +KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, value_type *dst, const value_type *val) { memcpy((void *)dst, val, block_dim * block_dim * sizeof(value_type)); } // Performs A += B on blocks template -KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, - value_type *dst, - const value_type *val) { +KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, value_type *dst, const value_type *val) { const auto end = dst + block_dim * block_dim; while (dst < end) { *(dst++) += *(val++); @@ -57,33 +52,25 @@ KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, // Note: block is assumed to be row-major, dense matrix (no extra padding) // Note: set clear=true to set C = 0 before increment template > -KOKKOS_INLINE_FUNCTION void kk_block_dgemm(const size_type block_dim, - value_type *dst, - const value_type *valA, - const value_type *valB, - const bool clear = false) { + typename DGEMM = KokkosBatched::SerialGemmInternal> +KOKKOS_INLINE_FUNCTION void kk_block_dgemm(const size_type block_dim, value_type *dst, const value_type *valA, + const value_type *valB, const bool clear = false) { const auto ZERO = static_cast(0); const auto ONE = static_cast(1); - DGEMM::invoke(block_dim, block_dim, block_dim, ONE, valA, block_dim, 1, valB, - block_dim, 1, clear ? ZERO : ONE, dst, block_dim, 1); + DGEMM::invoke(block_dim, block_dim, block_dim, ONE, valA, block_dim, 1, valB, block_dim, 1, clear ? ZERO : ONE, dst, + block_dim, 1); } // dgemm: C = A * B template -KOKKOS_INLINE_FUNCTION void kk_block_set_mul(const size_type block_dim, - value_type *c_val, - const value_type *a_val, +KOKKOS_INLINE_FUNCTION void kk_block_set_mul(const size_type block_dim, value_type *c_val, const value_type *a_val, const value_type *b_val) { kk_block_dgemm(block_dim, c_val, a_val, b_val, true); } // dgemm: C += A * B template -KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, - value_type *c_val, - const value_type *a_val, +KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, value_type *c_val, const value_type *a_val, const value_type *b_val) { kk_block_dgemm(block_dim, c_val, a_val, b_val, false); } @@ -91,9 +78,7 @@ KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, // Performs C += A * B (dense GEMM) on blocks // Note: all pointers reference dense row-major blocks (no extra padding) template -KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, - value_type *dst, - const value_type *valA, +KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, value_type *dst, const value_type *valA, const value_type *valB) { // NOTE: this should be replaced by batched DGEMM // once atomic increment is supported there @@ -102,8 +87,7 @@ KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, for (size_type col = 0; col < block_dim; ++col) { auto v = &dst[row_offset + col]; auto vb = valB + col; - for (const value_type *va = valA + row_offset, *end = va + block_dim; - va < end; ++va) { + for (const value_type *va = valA + row_offset, *end = va + block_dim; va < end; ++va) { Kokkos::atomic_add(v, (*va) * (*vb)); vb += block_dim; } diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 83f2c23ff2..05ce523ecf 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -23,32 +23,25 @@ namespace KokkosKernels { namespace Impl { -inline void throw_runtime_exception(const std::string &msg) { - throw std::runtime_error(msg); -} +inline void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } #if defined(KOKKOS_ENABLE_HIP) -inline void hip_internal_error_throw(hipError_t e, const char *name, - const char *file, const int line) { +inline void hip_internal_error_throw(hipError_t e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " << hipGetErrorName(e) - << "): " << hipGetErrorString(e); + out << name << " error( " << hipGetErrorName(e) << "): " << hipGetErrorString(e); if (file) { out << " " << file << ":" << line; } throw_runtime_exception(out.str()); } -inline void hip_internal_safe_call(hipError_t e, const char *name, - const char *file = nullptr, - const int line = 0) { +inline void hip_internal_safe_call(hipError_t e, const char *name, const char *file = nullptr, const int line = 0) { if (hipSuccess != e) { hip_internal_error_throw(e, name, file, line); } } -#define KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(call) \ - hip_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(call) hip_internal_safe_call(call, #call, __FILE__, __LINE__) #endif } // namespace Impl @@ -90,8 +83,7 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, #ifndef NDEBUG #define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) -#define KK_ASSERT_MSG(condition, msg) \ - IMPL_THROW(condition, msg, std::logic_error) +#define KK_ASSERT_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) #define KK_KERNEL_ASSERT(condition) IMPL_KERNEL_THROW(condition, "") #define KK_KERNEL_ASSERT_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) #else @@ -102,12 +94,10 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, #endif #define KK_REQUIRE(condition) IMPL_THROW(condition, "", std::logic_error) -#define KK_REQUIRE_MSG(condition, msg) \ - IMPL_THROW(condition, msg, std::logic_error) +#define KK_REQUIRE_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) #define KK_USER_REQUIRE(condition) IMPL_THROW(condition, "", std::runtime_error) -#define KK_USER_REQUIRE_MSG(condition, msg) \ - IMPL_THROW(condition, msg, std::runtime_error) +#define KK_USER_REQUIRE_MSG(condition, msg) IMPL_THROW(condition, msg, std::runtime_error) #define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") #define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index 4d3a3002b4..2d167f5c73 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -29,14 +29,7 @@ namespace KokkosKernels { namespace Impl { -enum ExecSpaceType { - Exec_SERIAL, - Exec_OMP, - Exec_THREADS, - Exec_CUDA, - Exec_HIP, - Exec_SYCL -}; +enum ExecSpaceType { Exec_SERIAL, Exec_OMP, Exec_THREADS, Exec_CUDA, Exec_HIP, Exec_SYCL }; template KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { @@ -105,8 +98,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { #ifdef KOKKOS_ENABLE_SYCL template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_gpu_exec_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { return true; } #endif @@ -122,8 +114,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_x86_64_mem_space() { #if __x86_64__ template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_x86_64_mem_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_x86_64_mem_space() { return true; } #endif // x86_64 architectures @@ -139,8 +130,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_a64fx_mem_space() { #if defined(__ARM_ARCH_ISA_A64) template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_a64fx_mem_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_a64fx_mem_space() { return true; } #endif // a64fx architectures @@ -148,86 +138,67 @@ kk_is_a64fx_mem_space() { // Host function to determine free and total device memory. // Will throw if execution space doesn't support this. template -inline void kk_get_free_total_memory(size_t& /* free_mem */, - size_t& /* total_mem */) { +inline void kk_get_free_total_memory(size_t& /* free_mem */, size_t& /* total_mem */) { std::ostringstream oss; - oss << "Error: memory space " << MemorySpace::name() - << " does not support querying free/total memory."; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; throw std::runtime_error(oss.str()); } // Host function to determine free and total device memory. // Will throw if execution space doesn't support this. template -inline void kk_get_free_total_memory(size_t& /* free_mem */, - size_t& /* total_mem */, - int /* n_streams */) { +inline void kk_get_free_total_memory(size_t& /* free_mem */, size_t& /* total_mem */, int /* n_streams */) { std::ostringstream oss; - oss << "Error: memory space " << MemorySpace::name() - << " does not support querying free/total memory."; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; throw std::runtime_error(oss.str()); } #ifdef KOKKOS_ENABLE_CUDA template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { cudaMemGetInfo(&free_mem, &total_mem); free_mem /= n_streams; total_mem /= n_streams; } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } #endif #ifdef KOKKOS_ENABLE_HIP template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); free_mem /= n_streams; total_mem /= n_streams; } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem, - int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } #endif @@ -236,12 +207,11 @@ inline void kk_get_free_total_memory( // available. Also, we assume to query memory associated with the default queue. #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, + int n_streams) { sycl::queue queue; - sycl::device device = queue.get_device(); - auto level_zero_handle = - sycl::get_native(device); + sycl::device device = queue.get_device(); + auto level_zero_handle = sycl::get_native(device); uint32_t n_memory_modules = 0; zesDeviceEnumMemoryModules(level_zero_handle, &n_memory_modules, nullptr); @@ -255,8 +225,7 @@ inline void kk_get_free_total_memory( total_mem = 0; free_mem = 0; std::vector mem_handles(n_memory_modules); - zesDeviceEnumMemoryModules(level_zero_handle, &n_memory_modules, - mem_handles.data()); + zesDeviceEnumMemoryModules(level_zero_handle, &n_memory_modules, mem_handles.data()); for (auto& mem_handle : mem_handles) { zes_mem_properties_t memory_properties{ZES_STRUCTURE_TYPE_MEM_PROPERTIES}; @@ -274,38 +243,30 @@ inline void kk_get_free_total_memory( } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory( - free_mem, total_mem, 1); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { - kk_get_free_total_memory( - free_mem, total_mem, n_streams); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory( - free_mem, total_mem, 1); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams) { - kk_get_free_total_memory( - free_mem, total_mem, n_streams); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory( - free_mem, total_mem, 1); +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif @@ -325,8 +286,7 @@ inline int kk_get_max_vector_size() { } #endif -inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz, - const ExecSpaceType exec_space) { +inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz, const ExecSpaceType exec_space) { int suggested_vector_size_ = 1; int max_vector_size = 1; switch (exec_space) { @@ -360,17 +320,14 @@ inline int kk_get_suggested_vector_size(const size_t nr, const size_t nnz, } else { suggested_vector_size_ = 64; } - if (suggested_vector_size_ > max_vector_size) - suggested_vector_size_ = max_vector_size; + if (suggested_vector_size_ > max_vector_size) suggested_vector_size_ = max_vector_size; break; } return suggested_vector_size_; } -inline int kk_get_suggested_team_size(const int vector_size, - const ExecSpaceType exec_space) { - if (exec_space == Exec_CUDA || exec_space == Exec_HIP || - exec_space == Exec_SYCL) { +inline int kk_get_suggested_team_size(const int vector_size, const ExecSpaceType exec_space) { + if (exec_space == Exec_CUDA || exec_space == Exec_HIP || exec_space == Exec_SYCL) { // TODO: where this is used, tune the target value for // threads per block (but 256 is probably OK for CUDA and HIP) return 256 / vector_size; diff --git a/common/src/KokkosKernels_HashmapAccumulator.hpp b/common/src/KokkosKernels_HashmapAccumulator.hpp index 1085cec4af..c57dfa83fd 100644 --- a/common/src/KokkosKernels_HashmapAccumulator.hpp +++ b/common/src/KokkosKernels_HashmapAccumulator.hpp @@ -36,8 +36,7 @@ struct HashOpType { struct pow2Modulo {}; }; -template +template /** * \brief HashmapAccumulator class * The use of this is described in the paper: @@ -96,13 +95,7 @@ struct HashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - HashmapAccumulator() - : hash_begins(), - hash_nexts(), - keys(), - values(), - __max_value_size(), - __hashOpRHS(0) {} + HashmapAccumulator() : hash_begins(), hash_nexts(), keys(), values(), __max_value_size(), __hashOpRHS(0) {} /** * \brief parameterized constructor HashmapAccumulator @@ -120,9 +113,8 @@ struct HashmapAccumulator { * Assumption: hash_begins_ are all initialized to -1. */ KOKKOS_INLINE_FUNCTION - HashmapAccumulator(const size_type max_value_size_, const size_type hashOpRHS, - size_type *hash_begins_, size_type *hash_nexts_, - key_type *keys_, value_type *values_) + HashmapAccumulator(const size_type max_value_size_, const size_type hashOpRHS, size_type *hash_begins_, + size_type *hash_nexts_, key_type *keys_, value_type *values_) : hash_begins(hash_begins_), hash_nexts(hash_nexts_), keys(keys_), @@ -139,11 +131,8 @@ struct HashmapAccumulator { // Accumulation is OR operation. // Insertion is sequential, no race condition for the insertion. KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeOr_TrackHashes(key_type key, - value_type value, - size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) { + int sequential_insert_into_hash_mergeOr_TrackHashes(key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return __insert_success; @@ -175,10 +164,9 @@ struct HashmapAccumulator { // TODO: This function is for triangle counting. // Assume that there are 2 values for triangle count. KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeOr_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int sequential_insert_into_hash_mergeOr_TriangleCount_TrackHashes(key_type key, value_type value, value_type *values2, + size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return __insert_success; @@ -210,10 +198,10 @@ struct HashmapAccumulator { // this is used in slow triangle counting method. // L x Incidence KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type * /*used_size_*/, size_type * /*used_hash_size*/, - size_type * /*used_hashes*/) { + int sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes(key_type key, value_type value, + value_type *values2, size_type * /*used_size_*/, + size_type * /*used_hash_size*/, + size_type * /*used_hashes*/) { size_type hash, i; if (key == -1) return __insert_success; @@ -234,8 +222,7 @@ struct HashmapAccumulator { // this is used in LxL or Incidence^T x L KOKKOS_INLINE_FUNCTION - value_type sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( - key_type key, value_type value) { + value_type sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes(key_type key, value_type value) { size_type hash, i; if (key == -1) return __insert_success; @@ -254,10 +241,9 @@ struct HashmapAccumulator { // this is used in slow triangle counting method. // L x Incidence KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int sequential_insert_into_hash_TriangleCount_TrackHashes(key_type key, value_type value, value_type *values2, + size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, my_index; if (key == -1) return __insert_success; @@ -283,11 +269,10 @@ struct HashmapAccumulator { // this is used in LxL or Incidence^T x L KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TriangleCount_TrackHashes( - key_type key, value_type value, size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) // issue-508, TODO figure out what this - // "used_hashes" is for + int sequential_insert_into_hash_TriangleCount_TrackHashes(key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, + size_type *used_hashes) // issue-508, TODO figure out what + // this "used_hashes" is for { size_type hash, my_index; @@ -315,9 +300,8 @@ struct HashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. KOKKOS_INLINE_FUNCTION - void sequential_insert_into_hash_mergeAdd_TrackHashes( - key_type key, value_type value, size_type *used_size_, - size_type *used_hash_size, size_type *used_hashes) { + void sequential_insert_into_hash_mergeAdd_TrackHashes(key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; if (key == -1) return; @@ -348,9 +332,7 @@ struct HashmapAccumulator { // used in the compression to count the sets. // also used in the symbolic of spgemm if no compression is applied. KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TrackHashes(key_type key, - size_type *used_size_, - size_type *used_hash_size, + int sequential_insert_into_hash_TrackHashes(key_type key, size_type *used_size_, size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; @@ -383,10 +365,9 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd_TrackHashes( - const key_type key, const value_type value, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes(const key_type key, const value_type value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -438,11 +419,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -453,10 +432,9 @@ struct HashmapAccumulator { // except uses atomic addition on updating the value // necessary if duplicate key insertions happen simultaneously KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( - const key_type key, const value_type value, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes(const key_type key, const value_type value, + volatile size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -509,11 +487,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -521,9 +497,8 @@ struct HashmapAccumulator { } KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd_TrackHashes_no_list( - const key_type key, const value_type value, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes_no_list(const key_type key, const value_type value, + size_type *used_hash_size, size_type *used_hashes) { size_type hash; if (key == -1) return __insert_success; @@ -541,11 +516,9 @@ struct HashmapAccumulator { Kokkos::atomic_add(values + hash, value); return __insert_success; } else if (keys[hash] == -1) { - if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, - key)) { + if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, key)) { // should only be here if we used a new hash - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, - size_type(1))] = hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; Kokkos::atomic_add(values + hash, value); return __insert_success; } @@ -565,11 +538,9 @@ struct HashmapAccumulator { // NOTE: this is an exact copy of vector_atmoic_insert_into_hash_mergeAdd from // https://github.com/kokkos/kokkos-kernels/blob/750fe24508a69ed4dba92bb4a9e17a6094b1a083/src/common/KokkosKernels_HashmapAccumulator.hpp#L442-L502 template - KOKKOS_INLINE_FUNCTION int - vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - const team_member_t & /* teamMember */, const int /* vector_size */, - size_type hash, const key_type key, const value_type value, - volatile size_type *used_size_, const size_type max_value_size_) { + KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + const team_member_t & /* teamMember */, const int /* vector_size */, size_type hash, const key_type key, + const value_type value, volatile size_type *used_size_, const size_type max_value_size_) { // Cannot compute hash here due to impl_speed use-case // hash = __compute_hash(key, __hashOpRHS); if (key == -1) return __insert_success; @@ -591,8 +562,7 @@ struct HashmapAccumulator { if (used_size_[0] >= max_value_size_) { return __insert_full; } - size_type my_write_index = - Kokkos::atomic_fetch_add(used_size_, size_type(1)); + size_type my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); if (my_write_index >= max_value_size_) { return __insert_full; @@ -630,8 +600,7 @@ struct HashmapAccumulator { // hashbeginning = hash_begins[hash] // hash_begins[hash] = my_write_index // hash_nexts[my_write_index] = hash_begins[hash] - size_type hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + size_type hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -645,20 +614,17 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeAdd(const key_type key, - const value_type value, + int vector_atomic_insert_into_hash_mergeAdd(const key_type key, const value_type value, volatile size_type *used_size_) { if (key == -1) return __insert_success; return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - nullptr, 0, __compute_hash(key, __hashOpRHS), key, value, used_size_, - __max_value_size); + nullptr, 0, __compute_hash(key, __hashOpRHS), key, value, used_size_, __max_value_size); } // used in symbolic of kkmem if the compression is not applied. KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash(const key_type &key, - volatile size_type *used_size_) { + int vector_atomic_insert_into_hash(const key_type &key, volatile size_type *used_size_) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -692,8 +658,7 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -706,8 +671,7 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeOr(const key_type &key, - const value_type &value, + int vector_atomic_insert_into_hash_mergeOr(const key_type &key, const value_type &value, volatile size_type *used_size_) { size_type hash, i, my_write_index, hashbeginning; @@ -744,8 +708,7 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); hash_nexts[my_write_index] = hashbeginning; return __insert_success; } @@ -758,10 +721,9 @@ struct HashmapAccumulator { // Insertion is simulteanous for the vector lanes of a thread. // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_mergeOr_TrackHashes( - const key_type &key, const value_type &value, - volatile size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_mergeOr_TrackHashes(const key_type &key, const value_type &value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -797,11 +759,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -809,10 +769,8 @@ struct HashmapAccumulator { } KOKKOS_INLINE_FUNCTION - int vector_atomic_insert_into_hash_TrackHashes(const key_type &key, - volatile size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) { + int vector_atomic_insert_into_hash_TrackHashes(const key_type &key, volatile size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_write_index, hashbeginning; if (key == -1) return __insert_success; @@ -846,11 +804,9 @@ struct HashmapAccumulator { hash_nexts[my_write_index] = hash_begins[hash]; #endif - hashbeginning = - Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hashbeginning = Kokkos::atomic_exchange(hash_begins + hash, my_write_index); if (hashbeginning == -1) { - used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = - hash; + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; } hash_nexts[my_write_index] = hashbeginning; return __insert_success; @@ -863,11 +819,9 @@ struct HashmapAccumulator { static constexpr int __insert_success = 0; static constexpr int __insert_full = 1; - template ::value || - std::is_same::value, - std::size_t>::type = 0> + template ::value || + std::is_same::value, + std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type bitmask) { size_type hash = key & bitmask; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED @@ -877,9 +831,8 @@ struct HashmapAccumulator { return hash; } - template ::value, - std::size_t>::type = 0> + template ::value, std::size_t>::type = 0> KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type divisor) { size_type hash = key % divisor; #ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED diff --git a/common/src/KokkosKernels_IOUtils.hpp b/common/src/KokkosKernels_IOUtils.hpp index fd3e44db09..eb44082a74 100644 --- a/common/src/KokkosKernels_IOUtils.hpp +++ b/common/src/KokkosKernels_IOUtils.hpp @@ -47,15 +47,13 @@ inline void getRandomBounds(double mag, Scalar &start, Scalar &end) { } template <> -inline void getRandomBounds(double mag, Kokkos::complex &start, - Kokkos::complex &end) { +inline void getRandomBounds(double mag, Kokkos::complex &start, Kokkos::complex &end) { start = Kokkos::complex(-mag, -mag); end = Kokkos::complex(mag, mag); } template <> -inline void getRandomBounds(double mag, Kokkos::complex &start, - Kokkos::complex &end) { +inline void getRandomBounds(double mag, Kokkos::complex &start, Kokkos::complex &end) { start = Kokkos::complex(-mag, -mag); end = Kokkos::complex(mag, mag); } @@ -98,9 +96,7 @@ inline size_t kk_get_file_size(const char *file) { } template -void buildEdgeListFromBinSrcTarg_undirected(const char *fnameSrc, - const char *fnameTarg, - size_t &numEdges, lno_t **srcs, +void buildEdgeListFromBinSrcTarg_undirected(const char *fnameSrc, const char *fnameTarg, size_t &numEdges, lno_t **srcs, lno_t **dst) { size_t srcFileSize = kk_get_file_size(fnameSrc); size_t trgFileSize = kk_get_file_size(fnameTarg); @@ -150,8 +146,7 @@ inline void kk_write_1Dview_to_file(idx_array_type view, const char *filename) { } template -inline void kk_read_1Dview_from_file(idx_array_type &view, - const char *filename) { +inline void kk_read_1Dview_from_file(idx_array_type &view, const char *filename) { typedef typename idx_array_type::HostMirror host_type; // typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -183,8 +178,7 @@ inline void kk_write_2Dview_to_file(idx_array_type view, const char *filename) { } template -inline void kk_read_2Dview_from_file(idx_array_type &view, - const char *filename) { +inline void kk_read_2Dview_from_file(idx_array_type &view, const char *filename) { typedef typename idx_array_type::HostMirror host_type; // typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -221,8 +215,7 @@ inline void kk_write_3Dview_to_file(idx_array_type view, const char *filename) { } template -inline void kk_read_3Dview_from_file(idx_array_type &view, - const char *filename) { +inline void kk_read_3Dview_from_file(idx_array_type &view, const char *filename) { typedef typename idx_array_type::HostMirror host_type; // typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -241,8 +234,7 @@ inline void kk_read_3Dview_from_file(idx_array_type &view, } template -[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins, - const idx *edge_ends, const wt *ew, +[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins, const idx *edge_ends, const wt *ew, const char *filename) { std::ofstream myFile(filename, std::ios::out | std::ios::binary); myFile.write((char *)&ne, sizeof(idx)); @@ -253,8 +245,7 @@ template } template -void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, - const char *filename) { +void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, const char *filename) { std::ifstream myFile(filename, std::ios::in | std::ios::binary); myFile.read((char *)ne, sizeof(idx)); @@ -269,8 +260,7 @@ void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, inline bool endswith(std::string const &fullString, std::string const &ending) { if (fullString.length() >= ending.length()) { - return (0 == fullString.compare(fullString.length() - ending.length(), - ending.length(), ending)); + return (0 == fullString.compare(fullString.length() - ending.length(), ending.length(), ending)); } else { return false; } diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp index e091932453..f7a5ccef96 100644 --- a/common/src/KokkosKernels_LowerBound.hpp +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -87,15 +87,11 @@ namespace Impl { At most view.size() predicate function calls */ -template > -KOKKOS_INLINE_FUNCTION typename ViewLike::size_type -lower_bound_sequential_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type &value, - Pred pred = Pred()) { +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { using size_type = typename ViewLike::size_type; - static_assert(1 == ViewLike::rank, - "lower_bound_sequential_thread requires rank-1 views"); + static_assert(1 == ViewLike::rank, "lower_bound_sequential_thread requires rank-1 views"); size_type i = 0; while (i < view.size() && pred(view(i), value)) { @@ -116,14 +112,11 @@ lower_bound_sequential_thread( At most log2(view.size()) + 1 predicate function calls */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type &value, - Pred pred = Pred()) { + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { using size_type = typename ViewLike::size_type; - static_assert(1 == ViewLike::rank, - "lower_bound_binary_thread requires rank-1 views"); + static_assert(1 == ViewLike::rank, "lower_bound_binary_thread requires rank-1 views"); size_type lo = 0; size_type hi = view.size(); @@ -155,13 +148,10 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( This minimizes the calls to predicate: for view.size() >= 8, this does a binary search, otherwise, a linear search */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type &value, - Pred pred = Pred()) { - static_assert(1 == ViewLike::rank, - "lower_bound_thread requires rank-1 views"); + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, "lower_bound_thread requires rank-1 views"); /* sequential search makes on average 0.5 * view.size memory accesses binary search makes log2(view.size)+1 accesses @@ -196,18 +186,14 @@ namespace Impl { Uses a single thread to call \c lower_bound_thread, and broadcasts that to all team members. */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_single_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { typename ViewLike::size_type idx; Kokkos::single( Kokkos::PerTeam(handle), - [&](typename ViewLike::size_type &lidx) { - lidx = KokkosKernels::lower_bound_thread(view, value, pred); - }, - idx); + [&](typename ViewLike::size_type &lidx) { lidx = KokkosKernels::lower_bound_thread(view, value, pred); }, idx); return idx; } @@ -229,16 +215,12 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_single_team( Apply pred(view(i), value) for i in [lo, hi) */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, - typename ViewLike::size_type lo, typename ViewLike::size_type hi, - Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + typename ViewLike::size_type lo, typename ViewLike::size_type hi, Pred pred = Pred()) { using size_type = typename ViewLike::size_type; - static_assert(1 == ViewLike::rank, - "lower_bound_sequential_team requires rank-1 views"); + static_assert(1 == ViewLike::rank, "lower_bound_sequential_team requires rank-1 views"); static_assert(is_iota_v || Kokkos::is_view::value, "lower_bound_sequential_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); @@ -251,7 +233,7 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( Kokkos::TeamThreadRange(handle, lo, hi), [&](const size_type &i, size_type &li) { li = KOKKOSKERNELS_MACRO_MIN(li, hi); - if (i < li) { // no need to search higher than the smallest so far + if (i < li) { // no need to search higher than the smallest so far if (!pred(view(i), value)) { // look for the smallest index that does // not satisfy li = i; @@ -276,11 +258,10 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( \returns To all team members, the smallest i for which pred(view(i), value) is false or view.size() if no such value */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { return lower_bound_sequential_team(handle, view, value, 0, view.size(), pred); } @@ -310,10 +291,9 @@ struct Range { /// \brief maximizes the lower bound, and minimizes the upper bound of a Range template struct RangeReducer { - using reducer = RangeReducer; - using value_type = Range; - using result_view_type = - Kokkos::View *, Space, Kokkos::MemoryUnmanaged>; + using reducer = RangeReducer; + using value_type = Range; + using result_view_type = Kokkos::View *, Space, Kokkos::MemoryUnmanaged>; private: value_type &value; @@ -356,13 +336,11 @@ struct RangeReducer { false Once there are fewer values left than threads in the team, switch to team sequential search */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { - static_assert(1 == ViewLike::rank, - "lower_bound_kary_team requires rank-1 views"); + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, "lower_bound_kary_team requires rank-1 views"); static_assert(is_iota_v || Kokkos::is_view::value, "lower_bound_kary_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); @@ -378,9 +356,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( } // otherwise, split the region up among threads - size_type mid = - lo + (hi - lo) * (handle.team_rank() + 1) / (handle.team_size() + 1); - auto ve = view(mid); + size_type mid = lo + (hi - lo) * (handle.team_rank() + 1) / (handle.team_size() + 1); + auto ve = view(mid); // reduce across threads to figure out where the new search bounds are // if a thread satisfies the predicate, the first element that does not @@ -433,14 +410,12 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( Pred should be a binary function comparing two `typename View::non_const_value_type` */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_team requires rank-1 views"); - static_assert(KokkosKernels::Impl::is_iota_v || - Kokkos::is_view::value, + static_assert(KokkosKernels::Impl::is_iota_v || Kokkos::is_view::value, "lower_bound_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); diff --git a/common/src/KokkosKernels_Macros.hpp b/common/src/KokkosKernels_Macros.hpp index 04234a5ce2..6c4093ca10 100644 --- a/common/src/KokkosKernels_Macros.hpp +++ b/common/src/KokkosKernels_Macros.hpp @@ -34,15 +34,13 @@ // is enabled, since in that case, Kokkos::ThreadVectorRange should be used // instead for SIMD parallel loops. -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ - defined(KOKKOS_ENABLE_OPENMP) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ENABLE_OPENMP) // For clang OpenMP support, see // https://clang.llvm.org/docs/OpenMPSupport.html#id1 #if defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) // GCC 4.8.5 and older do not support #pragma omp simd // Do not enable when using GCC 7.2.0 or 7.3.0 + C++17 due to a bug in gcc -#if (KOKKOS_COMPILER_GNU > 485) && \ - !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) && \ +#if (KOKKOS_COMPILER_GNU > 485) && !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) && \ !(KOKKOS_COMPILER_GNU == 730 && defined(KOKKOS_ENABLE_CXX17)) #define KOKKOSKERNELS_ENABLE_OMP_SIMD #endif @@ -99,9 +97,8 @@ // define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS if we are targeting a CUDA // architecture with "independent thread scheduling" (Volta70 and up). This // requires some extra logic in HashmapAccumulator to avoid data races. -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_ADA89) || \ - defined(KOKKOS_ARCH_HOPPER) +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ + defined(KOKKOS_ARCH_ADA89) || defined(KOKKOS_ARCH_HOPPER) #define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS #endif diff --git a/common/src/KokkosKernels_Predicates.hpp b/common/src/KokkosKernels_Predicates.hpp index a741d1353a..f3bc6f2b2c 100644 --- a/common/src/KokkosKernels_Predicates.hpp +++ b/common/src/KokkosKernels_Predicates.hpp @@ -32,17 +32,14 @@ namespace KokkosKernels { template struct GT { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /** * @brief Return true if a is greater than b * @param a First value to be compared * @param b Second value to be compared */ - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a > b; } }; @@ -53,13 +50,10 @@ struct GT { template struct GTE { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /// \brief return a >= b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a >= b; } }; @@ -70,13 +64,10 @@ struct GTE { template struct LT { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /// \brief return a < b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a < b; } }; @@ -87,13 +78,10 @@ struct LT { template struct LTE { using value_type = T; - static_assert(!Kokkos::ArithTraits::is_complex, - "Please define custom predicates for ordering complex types"); + static_assert(!Kokkos::ArithTraits::is_complex, "Please define custom predicates for ordering complex types"); /// \brief return a <= b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const noexcept { return a <= b; } }; @@ -106,10 +94,7 @@ struct Equal { using value_type = T; /// \brief return a == b - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const { - return a == b; - } + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const { return a == b; } }; /** @@ -133,8 +118,7 @@ struct Neg { * @param b Second value to be compared by the predicate * @return Boolean inverse of the result of the predicate applied to a and b */ - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const { return !pred_(a, b); } @@ -153,8 +137,7 @@ struct Refl { constexpr Refl(const Pred &pred) : pred_(pred) {} /// \brief return the underlying binary predicate with reversed arguments - KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, - const value_type &b) const { + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, const value_type &b) const { return pred_(b, a); } diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index c2e3a5187f..5870210912 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -37,8 +37,7 @@ inline void print_cublas_version_if_enabled(std::ostream& os) { inline void print_cusparse_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << cusparse_version_string() - << "\n"; + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << cusparse_version_string() << "\n"; #else os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; @@ -48,8 +47,7 @@ inline void print_cusparse_version_if_enabled(std::ostream& os) { inline void print_cusolver_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: " << cusolver_version_string() - << "\n"; + << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: " << cusolver_version_string() << "\n"; #else os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: no\n"; @@ -156,9 +154,8 @@ inline void print_version(std::ostream& os) { // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros // are not available in Kernels os << " " - << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION_MAJOR << "." - << KOKKOSKERNELS_VERSION_MINOR << "." << KOKKOSKERNELS_VERSION_PATCH - << '\n'; + << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION_MAJOR << "." << KOKKOSKERNELS_VERSION_MINOR << "." + << KOKKOSKERNELS_VERSION_PATCH << '\n'; } } // namespace Impl diff --git a/common/src/KokkosKernels_PrintUtils.hpp b/common/src/KokkosKernels_PrintUtils.hpp index 74b32c793a..b4817022fc 100644 --- a/common/src/KokkosKernels_PrintUtils.hpp +++ b/common/src/KokkosKernels_PrintUtils.hpp @@ -27,13 +27,11 @@ template struct Histogram { in_lno_view_t inview; out_lno_view_t outview; - Histogram(in_lno_view_t inview_, out_lno_view_t outview_) - : inview(inview_), outview(outview_) {} + Histogram(in_lno_view_t inview_, out_lno_view_t outview_) : inview(inview_), outview(outview_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t& ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; Kokkos::atomic_fetch_add(&(outview(inview(ii))), atomic_incr_type(1)); } }; @@ -47,13 +45,11 @@ struct Histogram { * them with 0, and size must be big enough to hold all values in input view. */ template -inline void kk_get_histogram( - typename in_lno_view_t::size_type in_elements, in_lno_view_t in_view, - out_lno_view_t histogram /*must be initialized with 0s*/) { +inline void kk_get_histogram(typename in_lno_view_t::size_type in_elements, in_lno_view_t in_view, + out_lno_view_t histogram /*must be initialized with 0s*/) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( - "KokkosKernels::Common::GetHistogram", my_exec_space(0, in_elements), - Histogram(in_view, histogram)); + Kokkos::parallel_for("KokkosKernels::Common::GetHistogram", my_exec_space(0, in_elements), + Histogram(in_view, histogram)); MyExecSpace().fence(); } @@ -68,9 +64,9 @@ inline void kk_get_histogram( * pritned. This parameter is not used if print_all is set to true. */ template -inline std::enable_if_t kk_print_1Dview( - std::ostream& os, idx_array_type view, bool print_all = false, - const char* sep = " ", size_t print_size = 40) { +inline std::enable_if_t kk_print_1Dview(std::ostream& os, idx_array_type view, + bool print_all = false, const char* sep = " ", + size_t print_size = 40) { typedef typename idx_array_type::HostMirror host_type; typedef typename idx_array_type::size_type idx; host_type host_view = Kokkos::create_mirror_view(view); @@ -95,12 +91,11 @@ inline std::enable_if_t kk_print_1Dview( * rank-2 vectors same like rank-1 vectors and prints multi-vector dimensions. */ template -inline std::enable_if_t= 2> kk_print_1Dview( - std::ostream& os, idx_array_type view, bool print_all = false, - const char* sep = " ", size_t print_size = 40) { +inline std::enable_if_t= 2> kk_print_1Dview(std::ostream& os, idx_array_type view, + bool print_all = false, const char* sep = " ", + size_t print_size = 40) { if (idx_array_type::rank == 2 && view.extent(1) == 1) { - kk_print_1Dview(os, subview(view, Kokkos::ALL, 0), print_all, sep, - print_size); + kk_print_1Dview(os, subview(view, Kokkos::ALL, 0), print_all, sep, print_size); return; } os << "[" << view.extent(0); @@ -120,8 +115,7 @@ inline std::enable_if_t= 2> kk_print_1Dview( * This interface is provided for backwards compatiblity. */ template -inline void kk_print_1Dview(idx_array_type view, bool print_all = false, - size_t print_size = 40) { +inline void kk_print_1Dview(idx_array_type view, bool print_all = false, size_t print_size = 40) { kk_print_1Dview(std::cout, view, print_all, " ", print_size); } diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 055c1d6d32..0ae29a2f50 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -21,8 +21,7 @@ #define KOKKOSKERNELS_MACRO_MIN(x, y) ((x) < (y) ? (x) : (y)) #define KOKKOSKERNELS_MACRO_MAX(x, y) ((x) < (y) ? (y) : (x)) -#define KOKKOSKERNELS_MACRO_ABS(x) \ - Kokkos::ArithTraits::type>::abs(x) +#define KOKKOSKERNELS_MACRO_ABS(x) Kokkos::ArithTraits::type>::abs(x) namespace KokkosKernels { @@ -53,8 +52,7 @@ struct ExclusiveParallelPrefixSum { KOKKOS_INLINE_FUNCTION void operator()(const size_t ii, value_type &update, const bool final) const { - value_type val = - (ii == array_sum.extent(0) - 1) ? value_type(0) : array_sum(ii); + value_type val = (ii == array_sum.extent(0) - 1) ? value_type(0) : array_sum(ii); if (final) { array_sum(ii) = value_type(update); } @@ -85,12 +83,10 @@ struct InclusiveParallelPrefixSum { * \param arr: the array for which the prefix sum will be performed. */ template -inline void kk_exclusive_parallel_prefix_sum( - const MyExecSpace &exec, typename view_t::value_type num_elements, - view_t arr) { +inline void kk_exclusive_parallel_prefix_sum(const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(exec, 0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr)); } @@ -101,8 +97,7 @@ inline void kk_exclusive_parallel_prefix_sum( * \param arr: the array for which the prefix sum will be performed. */ template -inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr) { +inline void kk_exclusive_parallel_prefix_sum(typename view_t::value_type num_elements, view_t arr) { kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr); } @@ -117,12 +112,10 @@ inline void kk_exclusive_parallel_prefix_sum( * prefix sum. */ template -inline void kk_exclusive_parallel_prefix_sum( - const MyExecSpace &exec, typename view_t::value_type num_elements, - view_t arr, typename view_t::non_const_value_type &finalSum) { +inline void kk_exclusive_parallel_prefix_sum(const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr, typename view_t::non_const_value_type &finalSum) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(exec, 0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr), finalSum); } @@ -136,9 +129,8 @@ inline void kk_exclusive_parallel_prefix_sum( * prefix sum. */ template -inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr, - typename view_t::non_const_value_type &finalSum) { +inline void kk_exclusive_parallel_prefix_sum(typename view_t::value_type num_elements, view_t arr, + typename view_t::non_const_value_type &finalSum) { kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } @@ -150,13 +142,10 @@ inline void kk_exclusive_parallel_prefix_sum( /// \param arr: the array for which the prefix sum will be performed. /// template -void kk_inclusive_parallel_prefix_sum( - MyExecSpace my_exec_space, - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void kk_inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, + forward_array_type arr) { typedef Kokkos::RangePolicy range_policy_t; - Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - range_policy_t(my_exec_space, 0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } @@ -167,9 +156,7 @@ void kk_inclusive_parallel_prefix_sum( /// \param arr: the array for which the prefix sum will be performed. /// template -void kk_inclusive_parallel_prefix_sum( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void kk_inclusive_parallel_prefix_sum(typename forward_array_type::value_type num_elements, forward_array_type arr) { MyExecSpace my_exec_space; return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } @@ -180,9 +167,7 @@ struct ReductionFunctor { ReductionFunctor(view_t arr_) : array_sum(arr_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, typename view_t::value_type &update) const { - update += array_sum(ii); - } + void operator()(const size_t ii, typename view_t::value_type &update) const { update += array_sum(ii); } }; template @@ -191,55 +176,44 @@ struct ReductionFunctor2 { ReductionFunctor2(view_t arr_) : array_sum(arr_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, size_t &update) const { - update += array_sum(ii); - } + void operator()(const size_t ii, size_t &update) const { update += array_sum(ii); } }; template struct DiffReductionFunctor { view_t array_begins; view2_t array_ends; - DiffReductionFunctor(view_t begins, view2_t ends) - : array_begins(begins), array_ends(ends) {} + DiffReductionFunctor(view_t begins, view2_t ends) : array_begins(begins), array_ends(ends) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, - typename view_t::non_const_value_type &update) const { + void operator()(const size_t ii, typename view_t::non_const_value_type &update) const { update += (array_ends(ii) - array_begins(ii)); } }; template -inline void kk_reduce_diff_view( - size_t num_elements, view_t smaller, view2_t bigger, - typename view_t::non_const_value_type &reduction) { +inline void kk_reduce_diff_view(size_t num_elements, view_t smaller, view2_t bigger, + typename view_t::non_const_value_type &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), - DiffReductionFunctor(smaller, bigger), reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), + DiffReductionFunctor(smaller, bigger), reduction); } template struct DiffReductionFunctorP { const it *array_begins; const it *array_ends; - DiffReductionFunctorP(const it *begins, const it *ends) - : array_begins(begins), array_ends(ends) {} + DiffReductionFunctorP(const it *begins, const it *ends) : array_begins(begins), array_ends(ends) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii, it &update) const { - update += (array_ends[ii] - array_begins[ii]); - } + void operator()(const size_t ii, it &update) const { update += (array_ends[ii] - array_begins[ii]); } }; template -inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, - const it *bigger, it &reduction) { +inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, const it *bigger, it &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), - DiffReductionFunctorP(smaller, bigger), reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), + DiffReductionFunctorP(smaller, bigger), reduction); } /*** @@ -249,33 +223,27 @@ inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, * \param arr: the array for which the prefix sum will be performed. */ template -inline void kk_reduce_view(size_t num_elements, view_t arr, - typename view_t::value_type &reduction) { +inline void kk_reduce_view(size_t num_elements, view_t arr, typename view_t::value_type &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView", - my_exec_space(0, num_elements), + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView", my_exec_space(0, num_elements), ReductionFunctor(arr), reduction); } template -inline void kk_reduce_view2(size_t num_elements, view_t arr, - size_t &reduction) { +inline void kk_reduce_view2(size_t num_elements, view_t arr, size_t &reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView2", - my_exec_space(0, num_elements), + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceView2", my_exec_space(0, num_elements), ReductionFunctor2(arr), reduction); } template ::mag_type> + typename eps_type = typename Kokkos::ArithTraits::mag_type> struct IsIdenticalFunctor { view_type1 view1; view_type2 view2; eps_type eps; - IsIdenticalFunctor(view_type1 view1_, view_type2 view2_, eps_type eps_) - : view1(view1_), view2(view2_), eps(eps_) {} + IsIdenticalFunctor(view_type1 view1_, view_type2 view2_, eps_type eps_) : view1(view1_), view2(view2_), eps(eps_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, size_t &is_equal) const { @@ -290,8 +258,7 @@ struct IsIdenticalFunctor { } }; -template +template bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { if (view1.extent(0) != view2.extent(0)) { return false; @@ -301,10 +268,8 @@ bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { typedef Kokkos::RangePolicy my_exec_space; size_t issame = 0; - Kokkos::parallel_reduce( - "KokkosKernels::Common::IsIdenticalView", my_exec_space(0, num_elements), - IsIdenticalFunctor(view1, view2, eps), - issame); + Kokkos::parallel_reduce("KokkosKernels::Common::IsIdenticalView", my_exec_space(0, num_elements), + IsIdenticalFunctor(view1, view2, eps), issame); MyExecSpace().fence(); if (issame > 0) { return false; @@ -314,15 +279,13 @@ bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { } template ::mag_type> + typename eps_type = typename Kokkos::ArithTraits::mag_type> struct IsRelativelyIdenticalFunctor { view_type1 view1; view_type2 view2; eps_type eps; - IsRelativelyIdenticalFunctor(view_type1 view1_, view_type2 view2_, - eps_type eps_) + IsRelativelyIdenticalFunctor(view_type1 view1_, view_type2 view2_, eps_type eps_) : view1(view1_), view2(view2_), eps(eps_) {} KOKKOS_INLINE_FUNCTION @@ -333,27 +296,22 @@ struct IsRelativelyIdenticalFunctor { typedef Kokkos::ArithTraits KATM; mag_type val_diff = KATM::zero(); - if (KAT::abs(view1(i)) > mag_type(eps) || - KAT::abs(view2(i)) > mag_type(eps)) { - val_diff = KAT::abs(view1(i) - view2(i)) / - (KAT::abs(view1(i)) + KAT::abs(view2(i))); + if (KAT::abs(view1(i)) > mag_type(eps) || KAT::abs(view2(i)) > mag_type(eps)) { + val_diff = KAT::abs(view1(i) - view2(i)) / (KAT::abs(view1(i)) + KAT::abs(view2(i))); } if (val_diff > mag_type(eps)) { Kokkos::printf( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " "(eps = %e, rel err = %e)\n", - (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), - KAT::imag(view2(i)), eps, val_diff); + (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), KAT::imag(view2(i)), eps, val_diff); num_diffs++; } } }; -template -bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2, - eps_type eps) { +template +bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { if (view1.extent(0) != view2.extent(0)) { return false; } @@ -362,12 +320,9 @@ bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2, typedef Kokkos::RangePolicy my_exec_space; size_t numDifferences = 0; - Kokkos::parallel_reduce( - "KokkosKernels::Common::IsRelativelyIdenticalView", - my_exec_space(0, num_elements), - IsRelativelyIdenticalFunctor( - view1, view2, eps), - numDifferences); + Kokkos::parallel_reduce("KokkosKernels::Common::IsRelativelyIdenticalView", my_exec_space(0, num_elements), + IsRelativelyIdenticalFunctor(view1, view2, eps), + numDifferences); return numDifferences == 0; } @@ -377,8 +332,7 @@ struct ReduceMaxFunctor { typedef typename view_type::non_const_value_type value_type; const value_type min_val; ReduceMaxFunctor(view_type view_to_reduce_) - : view_to_reduce(view_to_reduce_), - min_val((std::numeric_limits::lowest())) {} + : view_to_reduce(view_to_reduce_), min_val((std::numeric_limits::lowest())) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, value_type &max_reduction) const { value_type val = view_to_reduce(i); @@ -404,28 +358,24 @@ struct ReduceMaxFunctor { }; template -void kk_view_reduce_max( - size_t num_elements, view_type view_to_reduce, - typename view_type::non_const_value_type &max_reduction) { +void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce, + typename view_type::non_const_value_type &max_reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ReduceMax", my_exec_space(0, num_elements), - ReduceMaxFunctor(view_to_reduce), max_reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", my_exec_space(0, num_elements), + ReduceMaxFunctor(view_to_reduce), max_reduction); } // xorshift hash/pseudorandom function (supported for 32- and 64-bit integer // types only) template KOKKOS_FORCEINLINE_FUNCTION Value xorshiftHash(Value v) { - static_assert(std::is_unsigned::value, - "xorshiftHash: value must be an unsigned integer type"); + static_assert(std::is_unsigned::value, "xorshiftHash: value must be an unsigned integer type"); uint64_t x = v; x ^= x >> 12; x ^= x << 25; x ^= x >> 27; - return std::is_same::value - ? static_cast((x * 2685821657736338717ULL - 1) >> 16) - : static_cast(x * 2685821657736338717ULL - 1); + return std::is_same::value ? static_cast((x * 2685821657736338717ULL - 1) >> 16) + : static_cast(x * 2685821657736338717ULL - 1); } struct ViewHashFunctor { @@ -458,16 +408,14 @@ uint32_t hashView(const View &v) { // but it's not defined on Intel 19 (with GCC 7.2.0 standard library). // So just check if it's available before using. #ifdef __cpp_lib_has_unique_object_representations - static_assert(std::has_unique_object_representations< - typename View::non_const_value_type>::value, + static_assert(std::has_unique_object_representations::value, "KokkosKernels::Impl::hashView: the view's element type must " "not have any padding bytes."); #endif size_t nbytes = v.span() * sizeof(typename View::value_type); uint32_t h; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, nbytes), - ViewHashFunctor(reinterpret_cast(v.data())), h); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, nbytes), + ViewHashFunctor(reinterpret_cast(v.data())), h); return h; } @@ -476,18 +424,15 @@ struct SequentialFillFunctor { using size_type = typename V::size_type; using val_type = typename V::non_const_value_type; SequentialFillFunctor(const V &v_, val_type start_) : v(v_), start(start_) {} - KOKKOS_INLINE_FUNCTION void operator()(size_type i) const { - v(i) = start + (val_type)i; - } + KOKKOS_INLINE_FUNCTION void operator()(size_type i) const { v(i) = start + (val_type)i; } V v; val_type start; }; template void sequential_fill(const V &v, typename V::non_const_value_type start = 0) { - Kokkos::parallel_for( - Kokkos::RangePolicy(0, v.extent(0)), - SequentialFillFunctor(v, start)); + Kokkos::parallel_for(Kokkos::RangePolicy(0, v.extent(0)), + SequentialFillFunctor(v, start)); } } // namespace Impl diff --git a/common/src/KokkosKernels_Sorting.hpp b/common/src/KokkosKernels_Sorting.hpp index 20ce6deaa2..f91f11c164 100644 --- a/common/src/KokkosKernels_Sorting.hpp +++ b/common/src/KokkosKernels_Sorting.hpp @@ -17,7 +17,7 @@ #define _KOKKOSKERNELS_SORTING_HPP #include "Kokkos_Core.hpp" -#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum +#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum #include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space #include @@ -26,10 +26,7 @@ namespace KokkosKernels { namespace Impl { template struct DefaultComparator { - KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, - const Value rhs) const { - return lhs < rhs; - } + KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, const Value rhs) const { return lhs < rhs; } }; } // namespace Impl @@ -39,9 +36,8 @@ struct DefaultComparator { // Bitonic sort: sorts v according to the comparator object's operator(). // Default comparator is just operator< for v's element type. -template < - typename View, typename ExecSpace, typename Ordinal, - typename Comparator = Impl::DefaultComparator> +template > void bitonicSort(View v, const Comparator& comp = Comparator()); // -------------------------------------------------------- @@ -51,15 +47,12 @@ void bitonicSort(View v, const Comparator& comp = Comparator()); // Radix sort. Not in-place: requires scratch array 'valuesAux' to be the same // size as values. ValueType must be an unsigned integer type. template -KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, - ValueType* valuesAux, Ordinal n); +KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n); // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. template -KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, - ValueType* valuesAux, - PermType* perm, PermType* permAux, +KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n); // ------------------------------------------------------------------- @@ -70,39 +63,32 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, // raw array according to the comparator. template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort( - ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); +KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, + const Comparator& comp = Comparator()); // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. -template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( - ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); +KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, + const Comparator& comp = Comparator()); namespace Impl { // Functor that sorts a view on one team -template +template struct BitonicSingleTeamFunctor { - BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) - : v(v_), comp(comp_) {} + BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - KokkosKernels::TeamBitonicSort( - v.data(), v.extent(0), t, comp); + KokkosKernels::TeamBitonicSort(v.data(), v.extent(0), t, + comp); }; View v; Comparator comp; }; // Functor that sorts equally sized chunks on each team -template +template struct BitonicChunkFunctor { BitonicChunkFunctor(View& v_, const Comparator& comp_, Ordinal chunkSize_) : v(v_), comp(comp_), chunkSize(chunkSize_) {} @@ -111,9 +97,8 @@ struct BitonicChunkFunctor { Ordinal chunkStart = chunk * chunkSize; Ordinal n = chunkSize; if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - KokkosKernels::TeamBitonicSort( - v.data() + chunkStart, n, t, comp); + KokkosKernels::TeamBitonicSort(v.data() + chunkStart, n, + t, comp); }; View v; Comparator comp; @@ -122,12 +107,10 @@ struct BitonicChunkFunctor { // Functor that does just the first phase (brown) of bitonic sort on // equally-sized chunks -template +template struct BitonicPhase1Functor { typedef typename View::value_type Value; - BitonicPhase1Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, - Ordinal teamsPerBox_) + BitonicPhase1Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, Ordinal teamsPerBox_) : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Ordinal box = t.league_rank() / teamsPerBox; @@ -135,18 +118,17 @@ struct BitonicPhase1Functor { Ordinal work = boxSize / teamsPerBox / 2; Ordinal workStart = work * (t.league_rank() % teamsPerBox); Ordinal workReflect = boxSize - workStart - 1; - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [&](const Ordinal i) { - Ordinal elem1 = boxStart + workStart + i; - Ordinal elem2 = boxStart + workReflect - i; - if (elem2 < Ordinal(v.extent(0))) { - if (comp(v(elem2), v(elem1))) { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { + Ordinal elem1 = boxStart + workStart + i; + Ordinal elem2 = boxStart + workReflect - i; + if (elem2 < Ordinal(v.extent(0))) { + if (comp(v(elem2), v(elem1))) { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); }; View v; Comparator comp; @@ -155,12 +137,10 @@ struct BitonicPhase1Functor { }; // Functor that does the second phase (red) of bitonic sort -template +template struct BitonicPhase2Functor { typedef typename View::value_type Value; - BitonicPhase2Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, - Ordinal teamsPerBox_) + BitonicPhase2Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, Ordinal teamsPerBox_) : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Ordinal logBoxSize = 1; @@ -170,18 +150,17 @@ struct BitonicPhase2Functor { Ordinal work = boxSize / teamsPerBox / 2; Ordinal workStart = boxStart + work * (t.league_rank() % teamsPerBox); Ordinal jump = boxSize / 2; - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [&](const Ordinal i) { - Ordinal elem1 = workStart + i; - Ordinal elem2 = workStart + jump + i; - if (elem2 < Ordinal(v.extent(0))) { - if (comp(v(elem2), v(elem1))) { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { + Ordinal elem1 = workStart + i; + Ordinal elem2 = workStart + jump + i; + if (elem2 < Ordinal(v.extent(0))) { + if (comp(v(elem2), v(elem1))) { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); if (teamsPerBox == 1) { // This team can finish phase 2 for all the smaller red boxes that follow, // since there are no longer cross-team data dependencies @@ -189,26 +168,23 @@ struct BitonicPhase2Functor { t.team_barrier(); Ordinal logSubBoxSize = logBoxSize - subLevel; Ordinal subBoxSize = Ordinal(1) << logSubBoxSize; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { - Ordinal globalThread = i + t.league_rank() * work; - Ordinal subBox = globalThread >> (logSubBoxSize - 1); - Ordinal subBoxStart = subBox << logSubBoxSize; - Ordinal subBoxOffset = - globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - - 1); // i % (subBoxSize / 2) - Ordinal elem1 = subBoxStart + subBoxOffset; - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + subBoxSize / 2; - if (elem2 < Ordinal(v.extent(0))) { - if (comp(v(elem2), v(elem1))) { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), [&](const Ordinal i) { + Ordinal globalThread = i + t.league_rank() * work; + Ordinal subBox = globalThread >> (logSubBoxSize - 1); + Ordinal subBoxStart = subBox << logSubBoxSize; + Ordinal subBoxOffset = globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - 1); // i % (subBoxSize / 2) + Ordinal elem1 = subBoxStart + subBoxOffset; + // later phases (pink box): within a block, compare with fixed + // distance (boxSize / 2) apart + Ordinal elem2 = elem1 + subBoxSize / 2; + if (elem2 < Ordinal(v.extent(0))) { + if (comp(v(elem2), v(elem1))) { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); } } }; @@ -228,18 +204,15 @@ struct BitonicPhase2Functor { // type and an arbitrary device-compatible comparison operator (provided through // operator() of Comparator) If comparator is void, use operator< (which should // only be used for primitives) -template +template void bitonicSort(View v, const Comparator& comp) { typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; Ordinal n = v.extent(0); // If n is small, just sort on a single team if (n <= Ordinal(1) << 12) { - Kokkos::parallel_for( - team_policy(1, Kokkos::AUTO()), - Impl::BitonicSingleTeamFunctor( - v, comp)); + Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), + Impl::BitonicSingleTeamFunctor(v, comp)); } else { Ordinal npot = 1; while (npot < n) npot <<= 1; @@ -247,22 +220,17 @@ void bitonicSort(View v, const Comparator& comp) { Ordinal chunkSize = 512; Ordinal numTeams = npot / chunkSize; // First, sort within teams - Kokkos::parallel_for( - team_policy(numTeams, Kokkos::AUTO()), - Impl::BitonicChunkFunctor( - v, comp, chunkSize)); - for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; - teamsPerBox *= 2) { + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Impl::BitonicChunkFunctor(v, comp, chunkSize)); + for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; Kokkos::parallel_for( team_policy(numTeams, Kokkos::AUTO()), - Impl::BitonicPhase1Functor( - v, comp, boxSize, teamsPerBox)); + Impl::BitonicPhase1Functor(v, comp, boxSize, teamsPerBox)); for (int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) { - Kokkos::parallel_for( - team_policy(numTeams, Kokkos::AUTO()), - Impl::BitonicPhase2Functor( - v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Impl::BitonicPhase2Functor( + v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); } } } @@ -273,11 +241,9 @@ void bitonicSort(View v, const Comparator& comp) { // Better on CPU cores. Con: requires auxiliary storage, and this version only // works for integers template -KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, - ValueType* valuesAux, Ordinal n) { - static_assert( - std::is_integral::value && std::is_unsigned::value, - "radixSort can only be run on unsigned integers."); +KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) { + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); if (n <= 1) return; ValueType maxVal = 0; for (Ordinal i = 0; i < n; i++) { @@ -318,13 +284,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, // threads if (!inAux) { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (values[i] & mask) >> maskPos; + Ordinal bucket = (values[i] & mask) >> maskPos; valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; count[bucket]--; } } else { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; count[bucket]--; } @@ -348,13 +314,10 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, // lane. Con: requires auxiliary storage, this version only works for integers // (although float/double is possible) template -KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, - ValueType* valuesAux, - PermType* perm, PermType* permAux, +KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n) { - static_assert( - std::is_integral::value && std::is_unsigned::value, - "radixSort can only be run on unsigned integers."); + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); if (n <= 1) return; ValueType maxVal = 0; for (Ordinal i = 0; i < n; i++) { @@ -394,14 +357,14 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, // threads if (!inAux) { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (values[i] & mask) >> maskPos; + Ordinal bucket = (values[i] & mask) >> maskPos; valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; permAux[offset[bucket + 1] - count[bucket]] = perm[i]; count[bucket]--; } } else { for (Ordinal i = 0; i < n; i++) { - Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; perm[offset[bucket + 1] - count[bucket]] = permAux[i]; count[bucket]--; @@ -425,10 +388,8 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, // trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and // memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs // Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, - const TeamMember mem, +template +KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp) { // Algorithm only works on power-of-two input size only. // If n is not a power-of-two, will implicitly pretend @@ -443,52 +404,49 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, for (Ordinal i = 0; i < levels; i++) { for (Ordinal j = 0; j <= i; j++) { // n/2 pairs of items are compared in parallel - Kokkos::parallel_for( - Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { + // How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + // Which box contains this thread? + Ordinal boxID = t >> (i - j); // t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / + // 2; + Ordinal elem1 = boxStart + boxOffset; + if (j == 0) { + // first phase (brown box): within a block, compare with the + // opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if (elem2 < n) { + // both elements in bounds, so compare them and swap if out of + // order + if (comp(values[elem2], values[elem1])) { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; + } + } + } else { + // later phases (pink box): within a block, compare with fixed + // distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if (elem2 < n) { + if (comp(values[elem2], values[elem1])) { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; } - }); + } + } + }); mem.team_barrier(); } } } // Sort "values", while applying the same swaps to "perm" -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, - Ordinal n, const TeamMember mem, +template +KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp) { // Algorithm only works on power-of-two input size only. // If n is not a power-of-two, will implicitly pretend @@ -503,48 +461,47 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, for (Ordinal i = 0; i < levels; i++) { for (Ordinal j = 0; j <= i; j++) { // n/2 pairs of items are compared in parallel - Kokkos::parallel_for( - Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { + // How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + // Which box contains this thread? + Ordinal boxID = t >> (i - j); // t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / + // 2; + Ordinal elem1 = boxStart + boxOffset; + if (j == 0) { + // first phase (brown box): within a block, compare with the + // opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if (elem2 < n) { + // both elements in bounds, so compare them and swap if out of + // order + if (comp(values[elem2], values[elem1])) { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; + } + } + } else { + // later phases (pink box): within a block, compare with fixed + // distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if (elem2 < n) { + if (comp(values[elem2], values[elem1])) { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; } - }); + } + } + }); mem.team_barrier(); } } @@ -554,49 +511,40 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, // KokkosKernels::Impl:: namespace Impl { -template < - typename View, typename ExecSpace, typename Ordinal, - typename Comparator = Impl::DefaultComparator> +template > [[deprecated]] void bitonicSort(View v, const Comparator& comp = Comparator()) { KokkosKernels::bitonicSort(v, comp); } template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, - ValueType* valuesAux, - Ordinal n) { +[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) { KokkosKernels::SerialRadixSort(values, valuesAux, n); } // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2( - ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, - Ordinal n) { - KokkosKernels::SerialRadixSort2( - values, valuesAux, perm, permAux, n); +[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, + PermType* permAux, Ordinal n) { + KokkosKernels::SerialRadixSort2(values, valuesAux, perm, permAux, n); } template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort( - ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort( - values, n, mem, comp); +[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, + const Comparator& comp = Comparator()) { + KokkosKernels::TeamBitonicSort(values, n, mem, comp); } // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts // values[0...n]. -template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( - ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); +[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, + const TeamMember mem, + const Comparator& comp = Comparator()) { + KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); } } // namespace Impl diff --git a/common/src/KokkosKernels_TplsVersion.hpp b/common/src/KokkosKernels_TplsVersion.hpp index 3e00d72457..692f0fd350 100644 --- a/common/src/KokkosKernels_TplsVersion.hpp +++ b/common/src/KokkosKernels_TplsVersion.hpp @@ -50,8 +50,7 @@ inline std::string cusparse_version_string() { // Print version std::stringstream ss; - ss << CUSPARSE_VER_MAJOR << "." << CUSPARSE_VER_MINOR << "." - << CUSPARSE_VER_PATCH << "." << CUSPARSE_VER_BUILD; + ss << CUSPARSE_VER_MAJOR << "." << CUSPARSE_VER_MINOR << "." << CUSPARSE_VER_PATCH << "." << CUSPARSE_VER_BUILD; return ss.str(); } @@ -61,8 +60,7 @@ inline std::string cusparse_version_string() { inline std::string cusolver_version_string() { std::stringstream ss; - ss << CUSOLVER_VER_MAJOR << "." << CUSOLVER_VER_MINOR << "." - << CUSOLVER_VER_PATCH << "." << CUSOLVER_VER_BUILD; + ss << CUSOLVER_VER_MAJOR << "." << CUSOLVER_VER_MINOR << "." << CUSOLVER_VER_PATCH << "." << CUSOLVER_VER_BUILD; return ss.str(); } diff --git a/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp b/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp index e40b81a762..aa477815d6 100644 --- a/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp +++ b/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp @@ -176,10 +176,8 @@ class UniformMemoryPool { * initialized_value: the value to initialize \param pool_type_: whether * ManyThread2OneChunk or OneThread2OneChunk */ - UniformMemoryPool(const size_t num_chunks_, const size_t set_chunk_size_, - const data_type initialized_value = 0, - const PoolType pool_type_ = OneThread2OneChunk, - bool initialize = true) + UniformMemoryPool(const size_t num_chunks_, const size_t set_chunk_size_, const data_type initialized_value = 0, + const PoolType pool_type_ = OneThread2OneChunk, bool initialize = true) : num_chunks(1), num_set_chunks(num_chunks_), modular_num_chunks(0), @@ -200,9 +198,7 @@ class UniformMemoryPool { modular_num_chunks = num_chunks - 1; overall_size = num_chunks * chunk_size; if (num_set_chunks > 0) { - data_view = data_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "pool data"), - overall_size); + data_view = data_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "pool data"), overall_size); } data = (data_view.data()); @@ -233,9 +229,9 @@ class UniformMemoryPool { ~UniformMemoryPool() = default; - UniformMemoryPool(UniformMemoryPool &&) = default; - UniformMemoryPool(const UniformMemoryPool &) = default; - UniformMemoryPool &operator=(UniformMemoryPool &&) = default; + UniformMemoryPool(UniformMemoryPool &&) = default; + UniformMemoryPool(const UniformMemoryPool &) = default; + UniformMemoryPool &operator=(UniformMemoryPool &&) = default; UniformMemoryPool &operator=(const UniformMemoryPool &) = default; /** @@ -295,12 +291,10 @@ class UniformMemoryPool { } KOKKOS_INLINE_FUNCTION - data_type *get_arbitrary_free_chunk(const size_t &thread_index, - const size_t max_tries) const { + data_type *get_arbitrary_free_chunk(const size_t &thread_index, const size_t max_tries) const { size_t chunk_index = thread_index & modular_num_chunks; size_t num_try = 0; - while (!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, - 0, 1)) { + while (!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, 0, 1)) { chunk_index = (chunk_index + 1) & modular_num_chunks; ++num_try; if (num_try > max_tries) { @@ -344,9 +338,7 @@ class UniformMemoryPool { * \brief Returns the chunk index of the pointer. */ KOKKOS_INLINE_FUNCTION - size_t get_chunk_index(const data_type *chunk_ptr) const { - return (chunk_ptr - data) / chunk_size; - } + size_t get_chunk_index(const data_type *chunk_ptr) const { return (chunk_ptr - data) / chunk_size; } /** * \brief Releases the memory that has been allocated. diff --git a/common/src/KokkosKernels_UpperBound.hpp b/common/src/KokkosKernels_UpperBound.hpp index 901c865743..97efd7559c 100644 --- a/common/src/KokkosKernels_UpperBound.hpp +++ b/common/src/KokkosKernels_UpperBound.hpp @@ -70,11 +70,9 @@ namespace KokkosKernels { \returns index of first element in view where pred(value,element) is true, or view.size if no such element exists */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_thread( - const ViewLike &view, const typename ViewLike::non_const_value_type &value, - Pred pred = Pred()) { + const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { return lower_bound_thread(view, value, Neg(Refl(pred))); } @@ -88,11 +86,10 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_thread( \returns index of first element in view where pred(value,element) is true, or view.size if no such element exists */ -template > +template > KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_team( - const TeamMember &handle, const ViewLike &view, - const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { return lower_bound_team(handle, view, value, Neg(Refl(pred))); } diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 92419424b6..a087002d31 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -36,31 +36,26 @@ ExecSpaceType get_exec_space_type() { return kk_get_exec_space_type(); } -inline int get_suggested_vector__size(size_t nr, size_t nnz, - ExecSpaceType exec_space) { +inline int get_suggested_vector__size(size_t nr, size_t nnz, ExecSpaceType exec_space) { return kk_get_suggested_vector_size(nr, nnz, exec_space); } template -void get_histogram(typename in_lno_view_t::size_type in_elements, - in_lno_view_t in_view, +void get_histogram(typename in_lno_view_t::size_type in_elements, in_lno_view_t in_view, out_lno_view_t histogram /*must be initialized with 0s*/) { - kk_get_histogram( - in_elements, in_view, histogram); + kk_get_histogram(in_elements, in_view, histogram); } template void get_suggested_vector_size(int &suggested_vector_size_, idx nr, idx nnz) { - suggested_vector_size_ = kk_get_suggested_vector_size( - nr, nnz, get_exec_space_type()); + suggested_vector_size_ = kk_get_suggested_vector_size(nr, nnz, get_exec_space_type()); } // Get the best team size for the given functor. // If it uses shared memory, the amount used must be available through // f.team_shmem_size(n), not through the TeamPolicy. If this is how dynamic // shared is set, just use AUTO for the team size. -template +template int get_suggested_team_size(Functor &f, int vector_size) { using execution_space = typename team_policy_t::traits::execution_space; if (kk_is_gpu_exec_space()) { @@ -70,23 +65,18 @@ int get_suggested_team_size(Functor &f, int vector_size) { return 1; } -template -int get_suggested_team_size(Functor &f, int vector_size, size_t sharedPerTeam, - size_t sharedPerThread) { +template +int get_suggested_team_size(Functor &f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { using execution_space = typename team_policy_t::traits::execution_space; if (kk_is_gpu_exec_space()) { - team_policy_t temp = - team_policy_t(1, 1, vector_size) - .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), - Kokkos::PerThread(sharedPerThread)); + team_policy_t temp = team_policy_t(1, 1, vector_size) + .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); } else return 1; } -template +template struct FillSymmetricEdges { typedef typename idx_array_type::value_type idx; idx num_rows; @@ -97,44 +87,35 @@ struct FillSymmetricEdges { idx_out_edge_array_type srcs; idx_out_edge_array_type dsts; - FillSymmetricEdges(typename idx_array_type::value_type num_rows_, - idx_array_type xadj_, idx_edge_array_type adj_, + FillSymmetricEdges(typename idx_array_type::value_type num_rows_, idx_array_type xadj_, idx_edge_array_type adj_, - idx_out_edge_array_type srcs_, - idx_out_edge_array_type dsts_) - : num_rows(num_rows_), - nnz(adj_.extent(0)), - xadj(xadj_), - adj(adj_), - srcs(srcs_), - dsts(dsts_) {} + idx_out_edge_array_type srcs_, idx_out_edge_array_type dsts_) + : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), adj(adj_), srcs(srcs_), dsts(dsts_) {} KOKKOS_INLINE_FUNCTION void operator()(const team_member &teamMember) const { - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) return; idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - srcs[adjind] = ii + 1; - dsts[adjind] = colIndex + 1; - if (colIndex != ii) { - srcs[adjind + nnz] = colIndex + 1; - dsts[adjind + nnz] = ii + 1; - } - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + srcs[adjind] = ii + 1; + dsts[adjind] = colIndex + 1; + if (colIndex != ii) { + srcs[adjind + nnz] = colIndex + 1; + dsts[adjind + nnz] = ii + 1; + } + } + }); } }; -template +template struct FillSymmetricEdgesHashMap { typedef typename in_lno_row_view_t::value_type idx; idx num_rows; @@ -145,60 +126,47 @@ struct FillSymmetricEdgesHashMap { out_lno_row_view_t pre_pps; bool lower_only; - FillSymmetricEdgesHashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, + FillSymmetricEdgesHashMap(idx num_rows_, in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, out_lno_row_view_t pre_pps_) - : num_rows(num_rows_), - nnz(adj_.extent(0)), - xadj(xadj_), - adj(adj_), - umap(hashmap_), - pre_pps(pre_pps_) {} + : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), adj(adj_), umap(hashmap_), pre_pps(pre_pps_) {} KOKKOS_INLINE_FUNCTION void operator()(const team_member &teamMember /*, idx &nnz*/) const { - typedef typename std::remove_reference::type - atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - Kokkos::UnorderedMapInsertResult r = - umap.insert(Kokkos::pair(colIndex, ii)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - - Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - } - } else if (colIndex > ii) { - Kokkos::UnorderedMapInsertResult r = - umap.insert(Kokkos::pair(ii, colIndex)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - } - } else { - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(colIndex, ii)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + + Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); } - }); + } else if (colIndex > ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(ii, colIndex)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + } + } else { + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + } + } + }); } }; -template +template struct FillSymmetricLowerEdgesHashMap { typedef typename in_lno_row_view_t::value_type idx; idx num_rows; @@ -208,55 +176,41 @@ struct FillSymmetricLowerEdgesHashMap { hashmap_t umap; out_lno_row_view_t pre_pps; - FillSymmetricLowerEdgesHashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, - out_lno_row_view_t pre_pps_, - bool /* lower_only_ */ = false) - : num_rows(num_rows_), - nnz(adj_.extent(0)), - xadj(xadj_), - adj(adj_), - umap(hashmap_), - pre_pps(pre_pps_) {} + FillSymmetricLowerEdgesHashMap(idx num_rows_, in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, + out_lno_row_view_t pre_pps_, bool /* lower_only_ */ = false) + : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), adj(adj_), umap(hashmap_), pre_pps(pre_pps_) {} KOKKOS_INLINE_FUNCTION void operator()(const team_member &teamMember /*, idx &nnz*/) const { - typedef typename std::remove_reference::type - atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - Kokkos::UnorderedMapInsertResult r = - umap.insert(Kokkos::pair(colIndex, ii)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - } - } else if (colIndex > ii) { - Kokkos::UnorderedMapInsertResult r = - umap.insert(Kokkos::pair(ii, colIndex)); - if (r.success()) { - Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(colIndex, ii)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + } + } else if (colIndex > ii) { + Kokkos::UnorderedMapInsertResult r = umap.insert(Kokkos::pair(ii, colIndex)); + if (r.success()) { + Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); } - }); + } + } + }); } }; -template struct FillSymmetricCRS_HashMap { typedef typename in_lno_row_view_t::value_type idx; @@ -268,10 +222,8 @@ struct FillSymmetricCRS_HashMap { out_lno_row_view_t pre_pps; out_lno_nnz_view_t sym_adj; - FillSymmetricCRS_HashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, - out_lno_row_view_t pre_pps_, - out_lno_nnz_view_t sym_adj_) + FillSymmetricCRS_HashMap(idx num_rows_, in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, + out_lno_row_view_t pre_pps_, out_lno_nnz_view_t sym_adj_) : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), @@ -282,51 +234,42 @@ struct FillSymmetricCRS_HashMap { KOKKOS_INLINE_FUNCTION void operator()(const team_member_t &teamMember) const { - typedef typename std::remove_reference::type - atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), - atomic_incr_type(1)); - sym_adj[cAdjInd] = ii; - sym_adj[iAdjInd] = colIndex; - } - } else if (colIndex > ii) { - if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), - atomic_incr_type(1)); - sym_adj[cAdjInd] = ii; - sym_adj[iAdjInd] = colIndex; - } - } else { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), - atomic_incr_type(1)); - sym_adj[cAdjInd] = ii; - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + sym_adj[cAdjInd] = ii; + sym_adj[iAdjInd] = colIndex; } - }); + } else if (colIndex > ii) { + if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + idx iAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(ii)), atomic_incr_type(1)); + sym_adj[cAdjInd] = ii; + sym_adj[iAdjInd] = colIndex; + } + } else { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pre_pps(colIndex)), atomic_incr_type(1)); + sym_adj[cAdjInd] = ii; + } + } + }); } }; -template struct FillSymmetricEdgeList_HashMap { typedef typename in_lno_row_view_t::value_type idx; @@ -339,11 +282,8 @@ struct FillSymmetricEdgeList_HashMap { out_lno_nnz_view_t sym_dst; out_lno_row_view_t pps; - FillSymmetricEdgeList_HashMap(idx num_rows_, in_lno_row_view_t xadj_, - in_lno_nnz_view_t adj_, hashmap_t hashmap_, - out_lno_nnz_view_t sym_src_, - out_lno_nnz_view_t sym_dst_, - out_lno_row_view_t pps_) + FillSymmetricEdgeList_HashMap(idx num_rows_, in_lno_row_view_t xadj_, in_lno_nnz_view_t adj_, hashmap_t hashmap_, + out_lno_nnz_view_t sym_src_, out_lno_nnz_view_t sym_dst_, out_lno_row_view_t pps_) : num_rows(num_rows_), nnz(adj_.extent(0)), xadj(xadj_), @@ -355,44 +295,38 @@ struct FillSymmetricEdgeList_HashMap { KOKKOS_INLINE_FUNCTION void operator()(const team_member_t &teamMember) const { - typedef - typename std::remove_reference::type atomic_incr_type; - idx ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + typedef typename std::remove_reference::type atomic_incr_type; + idx ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= num_rows) { return; } idx row_begin = xadj[ii]; idx row_end = xadj[ii + 1]; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { - idx adjind = i + row_begin; - idx colIndex = adj[adjind]; - if (colIndex < num_rows) { - if (colIndex < ii) { - if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { - idx cAdjInd = Kokkos::atomic_fetch_add(&(pps(colIndex)), - atomic_incr_type(1)); - sym_src[cAdjInd] = colIndex; - sym_dst[cAdjInd] = ii; - } - } else if (colIndex > ii) { - if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { - idx cAdjInd = - Kokkos::atomic_fetch_add(&(pps(ii)), atomic_incr_type(1)); - sym_src[cAdjInd] = ii; - sym_dst[cAdjInd] = colIndex; - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, row_end - row_begin), [&](idx i) { + idx adjind = i + row_begin; + idx colIndex = adj[adjind]; + if (colIndex < num_rows) { + if (colIndex < ii) { + if (umap.insert(Kokkos::pair(colIndex, ii)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pps(colIndex)), atomic_incr_type(1)); + sym_src[cAdjInd] = colIndex; + sym_dst[cAdjInd] = ii; } - }); + } else if (colIndex > ii) { + if (umap.insert(Kokkos::pair(ii, colIndex)).success()) { + idx cAdjInd = Kokkos::atomic_fetch_add(&(pps(ii)), atomic_incr_type(1)); + sym_src[cAdjInd] = ii; + sym_dst[cAdjInd] = colIndex; + } + } + } + }); } }; template -void print_1Dview(std::ostream &os, idx_array_type view, bool print_all = false, - const char *sep = " ") { +void print_1Dview(std::ostream &os, idx_array_type view, bool print_all = false, const char *sep = " ") { kk_print_1Dview(os, view, print_all, sep); } @@ -403,8 +337,7 @@ void print_1Dview(idx_array_type view, bool print_all = false) { template void print_1Dpointer(const lno_t *pview, size_t size, bool print_all = false) { - typedef Kokkos::View - um_array_type; + typedef Kokkos::View um_array_type; um_array_type view(pview, size); kk_print_1Dview(view, print_all); } @@ -415,14 +348,12 @@ struct Reverse_Map_Init { typedef typename reverse_map_type::value_type reverse_type; forward_map_type forward_map; reverse_map_type reverse_map_xadj; - Reverse_Map_Init(forward_map_type forward_map_, - reverse_map_type reverse_xadj_) + Reverse_Map_Init(forward_map_type forward_map_, reverse_map_type reverse_xadj_) : forward_map(forward_map_), reverse_map_xadj(reverse_xadj_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type fm = forward_map[ii]; Kokkos::atomic_fetch_add(&(reverse_map_xadj(fm)), atomic_incr_type(1)); } @@ -436,44 +367,32 @@ struct Fill_Reverse_Map { reverse_map_type reverse_map_xadj; reverse_map_type reverse_map_adj; - Fill_Reverse_Map(forward_map_type forward_map_, - reverse_map_type reverse_map_xadj_, - reverse_map_type reverse_map_adj_) - : forward_map(forward_map_), - reverse_map_xadj(reverse_map_xadj_), - reverse_map_adj(reverse_map_adj_) {} + Fill_Reverse_Map(forward_map_type forward_map_, reverse_map_type reverse_map_xadj_, reverse_map_type reverse_map_adj_) + : forward_map(forward_map_), reverse_map_xadj(reverse_map_xadj_), reverse_map_adj(reverse_map_adj_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type c = forward_map[ii]; - const reverse_type future_index = Kokkos::atomic_fetch_add( - &(reverse_map_xadj(c - 1)), atomic_incr_type(1)); - reverse_map_adj(future_index) = ii; + const reverse_type future_index = Kokkos::atomic_fetch_add(&(reverse_map_xadj(c - 1)), atomic_incr_type(1)); + reverse_map_adj(future_index) = ii; } }; template -void inclusive_parallel_prefix_sum( - MyExecSpace my_exec_space, - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, + forward_array_type arr) { return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template -void inclusive_parallel_prefix_sum( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void inclusive_parallel_prefix_sum(typename forward_array_type::value_type num_elements, forward_array_type arr) { MyExecSpace my_exec_space; return inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template -void exclusive_parallel_prefix_sum( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void exclusive_parallel_prefix_sum(typename forward_array_type::value_type num_elements, forward_array_type arr) { kk_exclusive_parallel_prefix_sum(num_elements, arr); } @@ -499,21 +418,16 @@ struct PropogataMaxValstoZeros { } }; -template -void a_times_x_plus_b(typename in_array_t::value_type num_elements, - in_array_t out_arr, in_array_t in_arr, scalar_1 a, +template +void a_times_x_plus_b(typename in_array_t::value_type num_elements, in_array_t out_arr, in_array_t in_arr, scalar_1 a, scalar_2 b) { - kk_a_times_x_plus_b( - num_elements, out_arr, in_arr, a, b); + kk_a_times_x_plus_b(num_elements, out_arr, in_arr, a, b); } template -void modular_view(typename in_array_type::value_type num_elements, - out_array_type out_arr, in_array_type in_arr, +void modular_view(typename in_array_type::value_type num_elements, out_array_type out_arr, in_array_type in_arr, int mod_factor_) { - kk_modular_view( - num_elements, out_arr, in_arr, mod_factor_); + kk_modular_view(num_elements, out_arr, in_arr, mod_factor_); } template @@ -528,18 +442,14 @@ struct LinearInitialization { template void linear_init(typename array_type::value_type num_elements, array_type arr) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::LinearInit", - my_exec_space(0, num_elements), + Kokkos::parallel_for("KokkosKernels::Common::LinearInit", my_exec_space(0, num_elements), LinearInitialization(arr)); } template -void remove_zeros_in_xadj_vector( - typename forward_array_type::value_type num_elements, - forward_array_type arr) { +void remove_zeros_in_xadj_vector(typename forward_array_type::value_type num_elements, forward_array_type arr) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan("KokkosKernels::Common::RemoveZerosInXadjVector", - my_exec_space(0, num_elements), + Kokkos::parallel_scan("KokkosKernels::Common::RemoveZerosInXadjVector", my_exec_space(0, num_elements), PropogataMaxValstoZeros(arr)); } @@ -548,10 +458,9 @@ struct FillReverseBegins { const forward_array_type &forward_map; // vertex to colors reverse_array_type &reverse_map_xadj; // colors to vertex xadj - FillReverseBegins( - const forward_array_type &forward_map_, // vertex to colors - reverse_array_type &reverse_map_xadj_ // colors to vertex xadj - ) + FillReverseBegins(const forward_array_type &forward_map_, // vertex to colors + reverse_array_type &reverse_map_xadj_ // colors to vertex xadj + ) : forward_map(forward_map_), reverse_map_xadj(reverse_map_xadj_) {} KOKKOS_INLINE_FUNCTION @@ -575,10 +484,8 @@ struct Reverse_Map_Scale_Init { const reverse_type multiply_shift_for_scale; const reverse_type division_shift_for_bucket; - Reverse_Map_Scale_Init(forward_map_type forward_map_, - reverse_map_type reverse_xadj_, - reverse_type multiply_shift_for_scale_, - reverse_type division_shift_for_bucket_) + Reverse_Map_Scale_Init(forward_map_type forward_map_, reverse_map_type reverse_xadj_, + reverse_type multiply_shift_for_scale_, reverse_type division_shift_for_bucket_) : forward_map(forward_map_), reverse_map_xadj(reverse_xadj_), multiply_shift_for_scale(multiply_shift_for_scale_), @@ -586,8 +493,7 @@ struct Reverse_Map_Scale_Init { KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type fm = forward_map[ii]; fm = fm << multiply_shift_for_scale; fm += ii >> division_shift_for_bucket; @@ -606,10 +512,8 @@ struct Fill_Reverse_Scale_Map { const reverse_type multiply_shift_for_scale; const reverse_type division_shift_for_bucket; - Fill_Reverse_Scale_Map(forward_map_type forward_map_, - reverse_map_type reverse_map_xadj_, - reverse_map_type reverse_map_adj_, - reverse_type multiply_shift_for_scale_, + Fill_Reverse_Scale_Map(forward_map_type forward_map_, reverse_map_type reverse_map_xadj_, + reverse_map_type reverse_map_adj_, reverse_type multiply_shift_for_scale_, reverse_type division_shift_for_bucket_) : forward_map(forward_map_), reverse_map_xadj(reverse_map_xadj_), @@ -619,15 +523,13 @@ struct Fill_Reverse_Scale_Map { KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; forward_type fm = forward_map[ii]; fm = fm << multiply_shift_for_scale; fm += ii >> division_shift_for_bucket; - const reverse_type future_index = Kokkos::atomic_fetch_add( - &(reverse_map_xadj(fm - 1)), atomic_incr_type(1)); - reverse_map_adj(future_index) = ii; + const reverse_type future_index = Kokkos::atomic_fetch_add(&(reverse_map_xadj(fm - 1)), atomic_incr_type(1)); + reverse_map_adj(future_index) = ii; } }; @@ -636,8 +538,7 @@ struct StridedCopy { const from_view_t from; to_view_t to; const size_t stride; - StridedCopy(const from_view_t from_, to_view_t to_, size_t stride_) - : from(from_), to(to_), stride(stride_) {} + StridedCopy(const from_view_t from_, to_view_t to_, size_t stride_) : from(from_), to(to_), stride(stride_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &ii) const { @@ -665,18 +566,14 @@ struct StridedCopy { * values of reverse maps. Its size will be num_forward_elements. * */ -template -void create_reverse_map( - MyExecSpace my_exec_space, - const typename reverse_array_type::value_type - &num_forward_elements, // num_vertices - const typename forward_array_type::value_type - &num_reverse_elements, // num_colors +template +void create_reverse_map(MyExecSpace my_exec_space, + const typename reverse_array_type::value_type &num_forward_elements, // num_vertices + const typename forward_array_type::value_type &num_reverse_elements, // num_colors - const forward_array_type &forward_map, // vertex to colors - reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj) { // colros to vertex adj + const forward_array_type &forward_map, // vertex to colors + reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) { // colros to vertex adj typedef typename reverse_array_type::value_type lno_t; typedef typename forward_array_type::value_type reverse_lno_t; @@ -685,110 +582,84 @@ void create_reverse_map( typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = - reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), - num_reverse_elements + 1); - reverse_map_adj = reverse_array_type( - Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, - "REVERSE_ADJ"), - num_forward_elements); + reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), num_reverse_elements + 1); + reverse_map_adj = reverse_array_type(Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "REVERSE_ADJ"), + num_forward_elements); if (num_reverse_elements < MINIMUM_TO_ATOMIC) { - const lno_t scale_size = 1024; - const lno_t multiply_shift_for_scale = 10; - const lno_t division_shift_for_bucket = - lno_t(ceil(log(double(num_forward_elements) / scale_size) / log(2))); + const lno_t scale_size = 1024; + const lno_t multiply_shift_for_scale = 10; + const lno_t division_shift_for_bucket = lno_t(ceil(log(double(num_forward_elements) / scale_size) / log(2))); // const lno_t bucket_range_size = pow(2, division_shift_for_bucket); // coloring indices are base-1. we end up using not using element 1. - const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) - << multiply_shift_for_scale; + const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) << multiply_shift_for_scale; - reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), - tmp_reverse_size + 1); + reverse_array_type tmp_color_xadj(Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), tmp_reverse_size + 1); Reverse_Map_Scale_Init rmi( - forward_map, tmp_color_xadj, multiply_shift_for_scale, - division_shift_for_bucket); + forward_map, tmp_color_xadj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapScaleInit", - range_policy_t(my_exec_space, 0, num_forward_elements), - rmi); + range_policy_t(my_exec_space, 0, num_forward_elements), rmi); my_exec_space.fence(); - inclusive_parallel_prefix_sum( - my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); + inclusive_parallel_prefix_sum(my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); my_exec_space.fence(); Kokkos::parallel_for( - "KokkosKernels::Common::StridedCopy", - range_policy_t(my_exec_space, 0, num_reverse_elements + 1), - StridedCopy( - tmp_color_xadj, reverse_map_xadj, scale_size)); + "KokkosKernels::Common::StridedCopy", range_policy_t(my_exec_space, 0, num_reverse_elements + 1), + StridedCopy(tmp_color_xadj, reverse_map_xadj, scale_size)); my_exec_space.fence(); Fill_Reverse_Scale_Map frm( - forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, - division_shift_for_bucket); + forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - range_policy_t(my_exec_space, 0, num_forward_elements), - frm); + range_policy_t(my_exec_space, 0, num_forward_elements), frm); my_exec_space.fence(); } else // atomic implementation. { reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, - "TMP_REVERSE_XADJ"), - num_reverse_elements + 1); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "TMP_REVERSE_XADJ"), num_reverse_elements + 1); - Reverse_Map_Init rmi( - forward_map, reverse_map_xadj); + Reverse_Map_Init rmi(forward_map, reverse_map_xadj); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapInit", - range_policy_t(my_exec_space, 0, num_forward_elements), - rmi); + range_policy_t(my_exec_space, 0, num_forward_elements), rmi); my_exec_space.fence(); // print_1Dview(reverse_map_xadj); - inclusive_parallel_prefix_sum( - my_exec_space, num_reverse_elements + 1, reverse_map_xadj); + inclusive_parallel_prefix_sum(my_exec_space, num_reverse_elements + 1, + reverse_map_xadj); Kokkos::deep_copy(my_exec_space, tmp_color_xadj, reverse_map_xadj); my_exec_space.fence(); - Fill_Reverse_Map frm( - forward_map, tmp_color_xadj, reverse_map_adj); + Fill_Reverse_Map frm(forward_map, tmp_color_xadj, reverse_map_adj); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - range_policy_t(my_exec_space, 0, num_forward_elements), - frm); + range_policy_t(my_exec_space, 0, num_forward_elements), frm); my_exec_space.fence(); } } template -void create_reverse_map( - const typename reverse_array_type::value_type - &num_forward_elements, // num_vertices - const typename forward_array_type::value_type - &num_reverse_elements, // num_colors - - const forward_array_type &forward_map, // vertex to colors - reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj) { +void create_reverse_map(const typename reverse_array_type::value_type &num_forward_elements, // num_vertices + const typename forward_array_type::value_type &num_reverse_elements, // num_colors + + const forward_array_type &forward_map, // vertex to colors + reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) { MyExecSpace my_exec_space; - return create_reverse_map(my_exec_space, num_forward_elements, - num_reverse_elements, forward_map, reverse_map_xadj, + return create_reverse_map(my_exec_space, num_forward_elements, num_reverse_elements, forward_map, reverse_map_xadj, reverse_map_adj); } -template +template struct PermuteVector { typedef typename idx_array_type::value_type idx; value_array_type old_vector; out_value_array_type new_vector; idx_array_type old_to_new_mapping; idx mapping_size; - PermuteVector(value_array_type old_vector_, out_value_array_type new_vector_, - idx_array_type old_to_new_mapping_) + PermuteVector(value_array_type old_vector_, out_value_array_type new_vector_, idx_array_type old_to_new_mapping_) : old_vector(old_vector_), new_vector(new_vector_), old_to_new_mapping(old_to_new_mapping_), @@ -804,34 +675,24 @@ struct PermuteVector { } }; -template -void permute_vector(MyExecSpace my_exec_space, - typename idx_array_type::value_type num_elements, - idx_array_type &old_to_new_index_map, - value_array_type &old_vector, +template +void permute_vector(MyExecSpace my_exec_space, typename idx_array_type::value_type num_elements, + idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { using range_policy_t = Kokkos::RangePolicy; - Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", - range_policy_t(my_exec_space, 0, num_elements), - PermuteVector( - old_vector, new_vector, old_to_new_index_map)); + Kokkos::parallel_for("KokkosKernels::Common::PermuteVector", range_policy_t(my_exec_space, 0, num_elements), + PermuteVector(old_vector, new_vector, + old_to_new_index_map)); } -template -void permute_vector(typename idx_array_type::value_type num_elements, - idx_array_type &old_to_new_index_map, - value_array_type &old_vector, - out_value_array_type &new_vector) { - permute_vector(MyExecSpace(), num_elements, old_to_new_index_map, old_vector, - new_vector); +template +void permute_vector(typename idx_array_type::value_type num_elements, idx_array_type &old_to_new_index_map, + value_array_type &old_vector, out_value_array_type &new_vector) { + permute_vector(MyExecSpace(), num_elements, old_to_new_index_map, old_vector, new_vector); } -template +template struct PermuteBlockVector { typedef typename idx_array_type::value_type idx; int block_size; @@ -839,8 +700,7 @@ struct PermuteBlockVector { out_value_array_type new_vector; idx_array_type old_to_new_mapping; idx mapping_size; - PermuteBlockVector(int block_size_, value_array_type old_vector_, - out_value_array_type new_vector_, + PermuteBlockVector(int block_size_, value_array_type old_vector_, out_value_array_type new_vector_, idx_array_type old_to_new_mapping_) : block_size(block_size_), old_vector(old_vector_), @@ -854,55 +714,42 @@ struct PermuteBlockVector { if (ii < mapping_size) mapping = old_to_new_mapping[ii]; for (idx j = 0; j < static_cast(new_vector.extent(1)); j++) { for (int i = 0; i < block_size; ++i) { - new_vector.access(mapping * block_size + i, j) = - old_vector.access(ii * block_size + i, j); + new_vector.access(mapping * block_size + i, j) = old_vector.access(ii * block_size + i, j); } } } }; -template -void permute_block_vector(MyExecSpace my_exec_space, - typename idx_array_type::value_type num_elements, - int block_size, idx_array_type &old_to_new_index_map, - value_array_type &old_vector, +template +void permute_block_vector(MyExecSpace my_exec_space, typename idx_array_type::value_type num_elements, int block_size, + idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { using range_policy_t = Kokkos::RangePolicy; - Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", - range_policy_t(my_exec_space, 0, num_elements), - PermuteBlockVector(block_size, old_vector, new_vector, - old_to_new_index_map)); + Kokkos::parallel_for("KokkosKernels::Common::PermuteVector", range_policy_t(my_exec_space, 0, num_elements), + PermuteBlockVector( + block_size, old_vector, new_vector, old_to_new_index_map)); } -template -void permute_block_vector(typename idx_array_type::value_type num_elements, - int block_size, idx_array_type &old_to_new_index_map, - value_array_type &old_vector, +template +void permute_block_vector(typename idx_array_type::value_type num_elements, int block_size, + idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - permute_block_vector(MyExecSpace(), num_elements, block_size, - old_to_new_index_map, old_vector, new_vector); + permute_block_vector(MyExecSpace(), num_elements, block_size, old_to_new_index_map, old_vector, new_vector); } // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. template -void zero_vector(ExecSpaceIn &exec_space_in, - typename value_array_type::value_type /* num_elements */, +void zero_vector(ExecSpaceIn &exec_space_in, typename value_array_type::value_type /* num_elements */, value_array_type &vector) { typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(exec_space_in, vector, - Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(exec_space_in, vector, Kokkos::ArithTraits::zero()); exec_space_in.fence(); } template -void zero_vector(typename value_array_type::value_type /* num_elements */, - value_array_type &vector) { +void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { using ne_tmp_t = typename value_array_type::value_type; ne_tmp_t ne_tmp = ne_tmp_t(0); MyExecSpace my_exec_space; @@ -915,21 +762,15 @@ struct MarkDuplicateSortedKeyValuePairs { v2 vals; v3 prefix_sum; typename v1::size_type overall_size; - MarkDuplicateSortedKeyValuePairs(v1 keys_, v2 vals_, v3 prefix_sum_, - typename v1::size_type overall_size_) - : keys(keys_), - vals(vals_), - prefix_sum(prefix_sum_), - overall_size(overall_size_) {} + MarkDuplicateSortedKeyValuePairs(v1 keys_, v2 vals_, v3 prefix_sum_, typename v1::size_type overall_size_) + : keys(keys_), vals(vals_), prefix_sum(prefix_sum_), overall_size(overall_size_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, typename v3::value_type &num_result) const { typename v1::value_type my_key = keys(i); typename v2::value_type my_val = vals(i); - if ((my_key != 0 && my_val != 0) && - ((i + 1 >= overall_size) || - (my_key != keys(i + 1) || my_val != vals(i + 1)))) { + if ((my_key != 0 && my_val != 0) && ((i + 1 >= overall_size) || (my_key != keys(i + 1) || my_val != vals(i + 1)))) { prefix_sum(i) = 1; num_result += 1; } @@ -944,9 +785,7 @@ struct FillSymmetricCSR { typename v3::size_type array_size; v4 out_xadj; v5 out_adj; - FillSymmetricCSR(v1 keys_, v2 vals_, v3 prefix_sum_, - typename v3::size_type array_size_, v4 out_xadj_, - v5 out_adj_) + FillSymmetricCSR(v1 keys_, v2 vals_, v3 prefix_sum_, typename v3::size_type array_size_, v4 out_xadj_, v5 out_adj_) : keys(keys_), vals(vals_), prefix_sum(prefix_sum_), @@ -978,12 +817,10 @@ struct FillSymmetricCSR { } }; -template -void symmetrize_and_get_lower_diagonal_edge_list( - typename in_lno_nnz_view_t::value_type num_rows_to_symmetrize, - in_lno_row_view_t xadj, in_lno_nnz_view_t adj, out_lno_nnz_view_t &sym_srcs, - out_lno_nnz_view_t &sym_dsts_) { +template +void symmetrize_and_get_lower_diagonal_edge_list(typename in_lno_nnz_view_t::value_type num_rows_to_symmetrize, + in_lno_row_view_t xadj, in_lno_nnz_view_t adj, + out_lno_nnz_view_t &sym_srcs, out_lno_nnz_view_t &sym_dsts_) { typedef typename in_lno_row_view_t::non_const_value_type idx; idx nnz = adj.extent(0); @@ -997,8 +834,7 @@ void symmetrize_and_get_lower_diagonal_edge_list( // typedef Kokkos::RangePolicy my_exec_space; // TODO: Should change this to temporary memory space? - typedef Kokkos::UnorderedMap, void, MyExecSpace> - hashmap_t; + typedef Kokkos::UnorderedMap, void, MyExecSpace> hashmap_t; out_lno_nnz_view_t pre_pps_("pre_pps", num_rows_to_symmetrize + 1); @@ -1007,31 +843,26 @@ void symmetrize_and_get_lower_diagonal_edge_list( hashmap_t umap(nnz); umap.clear(); umap.end_erase(); - FillSymmetricLowerEdgesHashMap + FillSymmetricLowerEdgesHashMap fse(num_rows_to_symmetrize, xadj, adj, umap, pre_pps_); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); // std::cout << "max_allowed_team_size:" << max_allowed_team_size << " vs:" // << vector_size << " tsm:" << teamSizeMax<< std::endl; - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S0", pol, - fse /*, num_symmetric_edges*/); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S0", pol, + fse /*, num_symmetric_edges*/); MyExecSpace().fence(); } if (num_rows_to_symmetrize > 0) - exclusive_parallel_prefix_sum( - num_rows_to_symmetrize + 1, pre_pps_); + exclusive_parallel_prefix_sum(num_rows_to_symmetrize + 1, pre_pps_); MyExecSpace().fence(); auto d_sym_edge_size = Kokkos::subview(pre_pps_, num_rows_to_symmetrize); @@ -1046,45 +877,33 @@ void symmetrize_and_get_lower_diagonal_edge_list( num_symmetric_edges = h_sym_edge_size(h_sym_edge_size.extent(0) - 1); */ - sym_srcs = out_lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_srcs"), - num_symmetric_edges); - sym_dsts_ = out_lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_dsts_"), - num_symmetric_edges); + sym_srcs = out_lno_nnz_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_srcs"), num_symmetric_edges); + sym_dsts_ = out_lno_nnz_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_dsts_"), num_symmetric_edges); MyExecSpace().fence(); { hashmap_t umap(nnz); - FillSymmetricEdgeList_HashMap - FSCH(num_rows_to_symmetrize, xadj, adj, umap, sym_srcs, sym_dsts_, - pre_pps_); + FSCH(num_rows_to_symmetrize, xadj, adj, umap, sym_srcs, sym_dsts_, pre_pps_); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", pol, - FSCH); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", pol, FSCH); MyExecSpace().fence(); } } -template -void symmetrize_graph_symbolic_hashmap( - typename in_lno_row_view_t::value_type num_rows_to_symmetrize, - in_lno_row_view_t xadj, in_lno_nnz_view_t adj, out_lno_row_view_t &sym_xadj, - out_lno_nnz_view_t &sym_adj) { +template +void symmetrize_graph_symbolic_hashmap(typename in_lno_row_view_t::value_type num_rows_to_symmetrize, + in_lno_row_view_t xadj, in_lno_nnz_view_t adj, out_lno_row_view_t &sym_xadj, + out_lno_nnz_view_t &sym_adj) { typedef typename in_lno_row_view_t::non_const_value_type idx; idx nnz = adj.extent(0); @@ -1098,8 +917,7 @@ void symmetrize_graph_symbolic_hashmap( // typedef Kokkos::RangePolicy my_exec_space; // TODO: Should change this to temporary memory space? - typedef Kokkos::UnorderedMap, void, MyExecSpace> - hashmap_t; + typedef Kokkos::UnorderedMap, void, MyExecSpace> hashmap_t; out_lno_row_view_t pre_pps_("pre_pps", num_rows_to_symmetrize + 1); @@ -1108,66 +926,53 @@ void symmetrize_graph_symbolic_hashmap( hashmap_t umap(nnz); umap.clear(); umap.end_erase(); - FillSymmetricEdgesHashMap - fse(num_rows_to_symmetrize, xadj, adj, umap, pre_pps_); + FillSymmetricEdgesHashMap fse( + num_rows_to_symmetrize, xadj, adj, umap, pre_pps_); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(fse, vector_size); - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", pol, - fse /*, num_symmetric_edges*/); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", pol, + fse /*, num_symmetric_edges*/); MyExecSpace().fence(); } if (num_rows_to_symmetrize > 0) - exclusive_parallel_prefix_sum( - num_rows_to_symmetrize + 1, pre_pps_); + exclusive_parallel_prefix_sum(num_rows_to_symmetrize + 1, pre_pps_); MyExecSpace().fence(); // out_lno_row_view_t d_sym_edge_size = Kokkos::subview(pre_pps_, // num_rows_to_symmetrize, num_rows_to_symmetrize ); - typename out_lno_row_view_t::HostMirror h_sym_edge_size = - Kokkos::create_mirror_view(pre_pps_); + typename out_lno_row_view_t::HostMirror h_sym_edge_size = Kokkos::create_mirror_view(pre_pps_); Kokkos::deep_copy(h_sym_edge_size, pre_pps_); num_symmetric_edges = h_sym_edge_size(h_sym_edge_size.extent(0) - 1); - sym_adj = out_lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_adj"), - num_symmetric_edges); + sym_adj = out_lno_nnz_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_adj"), num_symmetric_edges); MyExecSpace().fence(); - sym_xadj = out_lno_row_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_xadj"), - num_rows_to_symmetrize + 1); + sym_xadj = + out_lno_row_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "sym_xadj"), num_rows_to_symmetrize + 1); Kokkos::deep_copy(sym_xadj, pre_pps_); { hashmap_t umap(nnz); - FillSymmetricCRS_HashMap FSCH(num_rows_to_symmetrize, xadj, adj, umap, pre_pps_, sym_adj); int teamSizeMax = 0; int vector_size = 0; - get_suggested_vector_size(vector_size, xadj.extent(0) - 1, - nnz); + get_suggested_vector_size(vector_size, xadj.extent(0) - 1, nnz); teamSizeMax = get_suggested_team_size(FSCH, vector_size); - team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size); - Kokkos::parallel_for( - "KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", pol, FSCH); + team_policy pol((num_rows_to_symmetrize + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size); + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", pol, FSCH); MyExecSpace().fence(); } @@ -1192,44 +997,36 @@ struct CopyView { template void copy_view(size_t num_elements, from_vector from, to_vector to) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::CopyView", - my_exec_space(0, num_elements), + Kokkos::parallel_for("KokkosKernels::Common::CopyView", my_exec_space(0, num_elements), CopyView(from, to)); } template -void safe_device_to_host_deep_copy(size_t num_elements, from_view from, - typename from_view::HostMirror to) { +void safe_device_to_host_deep_copy(size_t num_elements, from_view from, typename from_view::HostMirror to) { typedef typename from_view::value_type scalar_t; typedef typename from_view::device_type device_t; typedef Kokkos::View unstrided_from_view_t; unstrided_from_view_t unstrided_from("unstrided", num_elements); - copy_view(num_elements, from, - unstrided_from); + copy_view(num_elements, from, unstrided_from); Kokkos::fence(); typedef typename unstrided_from_view_t::HostMirror host_unstrided_from_view_t; - host_unstrided_from_view_t h_unstrided_from = - Kokkos::create_mirror_view(unstrided_from); + host_unstrided_from_view_t h_unstrided_from = Kokkos::create_mirror_view(unstrided_from); Kokkos::deep_copy(h_unstrided_from, unstrided_from); Kokkos::fence(); copy_view( - num_elements, h_unstrided_from, to); + typename host_unstrided_from_view_t::device_type::execution_space>(num_elements, h_unstrided_from, to); Kokkos::fence(); } template -void safe_host_to_device_deep_copy(size_t num_elements, - typename to_view::HostMirror from, - to_view to) { +void safe_host_to_device_deep_copy(size_t num_elements, typename to_view::HostMirror from, to_view to) { typedef typename to_view::value_type scalar_t; typedef typename to_view::device_type device_t; @@ -1241,17 +1038,15 @@ void safe_host_to_device_deep_copy(size_t num_elements, host_unstrided_view_t host_unstrided_from("unstrided", num_elements); device_unstrided_view_t device_unstrided_to("unstrided", num_elements); - copy_view(num_elements, from, - host_unstrided_from); + copy_view( + num_elements, from, host_unstrided_from); Kokkos::fence(); Kokkos::deep_copy(device_unstrided_to, host_unstrided_from); Kokkos::fence(); - copy_view(num_elements, - device_unstrided_to, to); + copy_view(num_elements, device_unstrided_to, + to); Kokkos::fence(); } @@ -1260,12 +1055,9 @@ template struct ReduceSumFunctor { view_type view_to_reduce; - ReduceSumFunctor(view_type view_to_reduce_) - : view_to_reduce(view_to_reduce_) {} + ReduceSumFunctor(view_type view_to_reduce_) : view_to_reduce(view_to_reduce_) {} - void operator()( - const size_t &i, - typename view_type::non_const_value_type &sum_reduction) const { + void operator()(const size_t &i, typename view_type::non_const_value_type &sum_reduction) const { sum_reduction += view_to_reduce(i); } }; @@ -1274,16 +1066,14 @@ template void view_reduce_sum(size_t num_elements, view_type view_to_reduce, typename view_type::non_const_value_type &sum_reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceSum", my_exec_space(0, num_elements), - ReduceSumFunctor(view_to_reduce), sum_reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ViewReduceSum", my_exec_space(0, num_elements), + ReduceSumFunctor(view_to_reduce), sum_reduction); } template void view_reduce_max(size_t num_elements, view_type view_to_reduce, typename view_type::non_const_value_type &max_reduction) { - kk_view_reduce_max(num_elements, view_to_reduce, - max_reduction); + kk_view_reduce_max(num_elements, view_to_reduce, max_reduction); } template @@ -1319,28 +1109,18 @@ struct ReduceRowSizeFunctor { // view has num_rows+1 elements. template -void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, - const size_t num_rows, - const size_type *rowmap_view_begins, - const size_type *rowmap_view_ends, - size_type &max_row_size) { +void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, const size_t num_rows, const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, size_type &max_row_size) { typedef Kokkos::RangePolicy range_policy_t; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxRowSize", - range_policy_t(my_exec_space, 0, num_rows), - ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), - max_row_size); + Kokkos::parallel_reduce("KokkosKernels::Common::ViewReduceMaxRowSize", range_policy_t(my_exec_space, 0, num_rows), + ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), max_row_size); } // view has num_rows+1 elements. template -void kk_view_reduce_max_row_size(const size_t num_rows, - const size_type *rowmap_view_begins, - const size_type *rowmap_view_ends, - size_type &max_row_size) { - return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, - rowmap_view_begins, rowmap_view_ends, - max_row_size); +void kk_view_reduce_max_row_size(const size_t num_rows, const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, size_type &max_row_size) { + return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, rowmap_view_begins, rowmap_view_ends, max_row_size); } template @@ -1348,8 +1128,7 @@ struct ReduceMaxRowFunctor { view_type rowmap_view; typedef typename view_type::non_const_value_type value_type; const value_type min_val; - ReduceMaxRowFunctor(view_type rowmap_view_) - : rowmap_view(rowmap_view_), min_val(0) {} + ReduceMaxRowFunctor(view_type rowmap_view_) : rowmap_view(rowmap_view_), min_val(0) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, value_type &max_reduction) const { @@ -1377,13 +1156,11 @@ struct ReduceMaxRowFunctor { // view has num_rows+1 elements. template -void view_reduce_maxsizerow( - size_t num_rows, view_type rowmap_view, - typename view_type::non_const_value_type &max_reduction) { +void view_reduce_maxsizerow(size_t num_rows, view_type rowmap_view, + typename view_type::non_const_value_type &max_reduction) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxSizeRow", my_exec_space(0, num_rows), - ReduceMaxRowFunctor(rowmap_view), max_reduction); + Kokkos::parallel_reduce("KokkosKernels::Common::ViewReduceMaxSizeRow", my_exec_space(0, num_rows), + ReduceMaxRowFunctor(rowmap_view), max_reduction); } template @@ -1391,8 +1168,7 @@ struct IsEqualFunctor { view_type1 view1; view_type2 view2; - IsEqualFunctor(view_type1 view1_, view_type2 view2_) - : view1(view1_), view2(view2_) {} + IsEqualFunctor(view_type1 view1_, view_type2 view2_) : view1(view1_), view2(view2_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, int &is_equal) const { @@ -1412,9 +1188,8 @@ template bool isSame(size_t num_elements, view_type1 view1, view_type2 view2) { typedef Kokkos::RangePolicy my_exec_space; int issame = 1; - Kokkos::parallel_reduce( - "KokkosKernels::Common::isSame", my_exec_space(0, num_elements), - IsEqualFunctor(view1, view2), issame); + Kokkos::parallel_reduce("KokkosKernels::Common::isSame", my_exec_space(0, num_elements), + IsEqualFunctor(view1, view2), issame); MyExecSpace().fence(); return issame; } @@ -1427,14 +1202,10 @@ struct MaxHeap { size_type current_size; MaxHeap(a_view_t heap_keys_, b_view_t heap_values_, size_type max_size_) - : heap_keys(heap_keys_), - heap_values(heap_values_), - max_size(max_size_), - current_size(0) {} + : heap_keys(heap_keys_), heap_values(heap_values_), max_size(max_size_), current_size(0) {} KOKKOS_INLINE_FUNCTION - void insert(typename a_view_t::value_type &key, - typename b_view_t::value_type &val) { + void insert(typename a_view_t::value_type &key, typename b_view_t::value_type &val) { for (size_type i = 0; i < current_size; ++i) { if (key == heap_keys(i)) { heap_values(i) = heap_values(i) & val; @@ -1459,8 +1230,7 @@ struct InitScalar { size_type team_row_chunk_size; nnz_lno_t init_val; - InitScalar(size_type num_elements_, in_view_t view_to_init_, - size_type chunk_size_, nnz_lno_t init_val_) + InitScalar(size_type num_elements_, in_view_t view_to_init_, size_type chunk_size_, nnz_lno_t init_val_) : num_elements(num_elements_), view_to_init(view_to_init_), team_row_chunk_size(chunk_size_), @@ -1471,20 +1241,16 @@ struct InitScalar { // const nnz_lno_t row_index = teamMember.league_rank() * // team_row_chunk_size; - const nnz_lno_t team_row_begin = - teamMember.league_rank() * team_row_chunk_size; - const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN( - team_row_begin + team_row_chunk_size, num_elements); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), - [&](const nnz_lno_t &row_ind) { view_to_init[row_ind] = init_val; }); + const nnz_lno_t team_row_begin = teamMember.league_rank() * team_row_chunk_size; + const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_row_chunk_size, num_elements); + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_ind) { view_to_init[row_ind] = init_val; }); } }; template -void init_view_withscalar( - typename in_row_view_t::size_type num_elements, in_row_view_t arr, - typename in_row_view_t::size_type team_size, - typename in_row_view_t::non_const_value_type init_val) { +void init_view_withscalar(typename in_row_view_t::size_type num_elements, in_row_view_t arr, + typename in_row_view_t::size_type team_size, + typename in_row_view_t::non_const_value_type init_val) { typename in_row_view_t::size_type chunk_size = num_elements / team_size; typedef InitScalar InitScalar_t; InitScalar_t tm(num_elements, arr, chunk_size, init_val); @@ -1492,9 +1258,8 @@ void init_view_withscalar( int vector_size = 1; Kokkos::Timer timer1; - Kokkos::parallel_for( - "KokkosKernels::Common::InitViewWithScalar", - tcp_t(num_elements / chunk_size + 1, team_size, vector_size), tm); + Kokkos::parallel_for("KokkosKernels::Common::InitViewWithScalar", + tcp_t(num_elements / chunk_size + 1, team_size, vector_size), tm); MyExecSpace().fence(); } @@ -1504,8 +1269,7 @@ struct array_sum_reduce { using ValueType = array_sum_reduce; // Workaround for https://github.com/kokkos/kokkos/issues/5860 static constexpr int N_internal = - ((N == 3 || N == 5 || N == 7) && - std::is_same::value && + ((N == 3 || N == 5 || N == 7) && std::is_same::value && sizeof(Kokkos::Experimental::half_t) == 2) ? (N + 1) : N; @@ -1533,11 +1297,9 @@ KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr *p) { const std::uintptr_t ptrVal = reinterpret_cast(p); // ptrVal + (align - 1) lands inside the next valid aligned scalar_t, // and the mask produces the start of that scalar_t. - const std::uintptr_t ptrValNew = - (ptrVal + alignof(T) - 1) & (~(alignof(T) - 1)); - return reinterpret_cast( - reinterpret_cast(const_cast *>(p)) + - (ptrValNew - ptrVal)); + const std::uintptr_t ptrValNew = (ptrVal + alignof(T) - 1) & (~(alignof(T) - 1)); + return reinterpret_cast(reinterpret_cast(const_cast *>(p)) + + (ptrValNew - ptrVal)); } } // namespace Impl diff --git a/common/src/KokkosKernels_VectorUtils.hpp b/common/src/KokkosKernels_VectorUtils.hpp index f0c09a7e9f..d20d298956 100644 --- a/common/src/KokkosKernels_VectorUtils.hpp +++ b/common/src/KokkosKernels_VectorUtils.hpp @@ -22,15 +22,13 @@ namespace KokkosKernels { namespace Impl { -template +template struct A_times_X_plus_B { out_array_t out_view; in_array_t in_view; const scalar_1 a; const scalar_2 b; - A_times_X_plus_B(out_array_t out_view_, in_array_t in_view_, scalar_1 a_, - scalar_2 b_) + A_times_X_plus_B(out_array_t out_view_, in_array_t in_view_, scalar_1 a_, scalar_2 b_) : out_view(out_view_), in_view(in_view_), a(a_), b(b_) {} KOKKOS_INLINE_FUNCTION @@ -47,9 +45,7 @@ struct ModularView { : out_view(out_view_), in_view(in_view_), modular_constant(mod_factor_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t ii) const { - out_view(ii) = in_view(ii) % modular_constant; - } + void operator()(const size_t ii) const { out_view(ii) = in_view(ii) % modular_constant; } }; template @@ -72,16 +68,12 @@ struct CopyVectorFunctor { * \param a: scalar for multiplication * \param b: scalar for addition */ -template -inline void kk_a_times_x_plus_b(typename in_array_t::value_type num_elements, - out_array_t out_arr, in_array_t in_arr, +template +inline void kk_a_times_x_plus_b(typename in_array_t::value_type num_elements, out_array_t out_arr, in_array_t in_arr, scalar_1 a, scalar_2 b) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( - "KokkosKernels::Common::ATimesXPlusB", my_exec_space(0, num_elements), - A_times_X_plus_B( - out_arr, in_arr, a, b)); + Kokkos::parallel_for("KokkosKernels::Common::ATimesXPlusB", my_exec_space(0, num_elements), + A_times_X_plus_B(out_arr, in_arr, a, b)); } /** @@ -92,20 +84,17 @@ inline void kk_a_times_x_plus_b(typename in_array_t::value_type num_elements, * applied. */ template -inline void kk_modular_view(typename in_array_type::value_type num_elements, - out_array_type out_arr, in_array_type in_arr, - int mod_factor_) { +inline void kk_modular_view(typename in_array_type::value_type num_elements, out_array_type out_arr, + in_array_type in_arr, int mod_factor_) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( - "KokkosKernels::Common::ModularView", my_exec_space(0, num_elements), - ModularView(out_arr, in_arr, mod_factor_)); + Kokkos::parallel_for("KokkosKernels::Common::ModularView", my_exec_space(0, num_elements), + ModularView(out_arr, in_arr, mod_factor_)); } template void kk_copy_vector(size_t num_elements, from_vector from, to_vector to) { typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::CopyVector", - my_exec_space(0, num_elements), + Kokkos::parallel_for("KokkosKernels::Common::CopyVector", my_exec_space(0, num_elements), CopyVectorFunctor(from, to)); } } // namespace Impl diff --git a/common/src/KokkosKernels_helpers.hpp b/common/src/KokkosKernels_helpers.hpp index 1b725f2f5c..cea3a8a061 100644 --- a/common/src/KokkosKernels_helpers.hpp +++ b/common/src/KokkosKernels_helpers.hpp @@ -16,7 +16,7 @@ #ifndef KOKKOSKERNELS_HELPERS_HPP_ #define KOKKOSKERNELS_HELPERS_HPP_ -#include "KokkosKernels_config.h" // KOKKOSKERNELS_INST_LAYOUTLEFT, KOKKOSKERNELS_INST_LAYOUTRIGHT +#include "KokkosKernels_config.h" // KOKKOSKERNELS_INST_LAYOUTLEFT, KOKKOSKERNELS_INST_LAYOUTRIGHT #include "KokkosKernels_default_types.hpp" // default_layout #include @@ -29,49 +29,43 @@ namespace Impl { // Used to reduce number of code instantiations. template struct GetUnifiedLayoutPreferring { - using array_layout = typename std::conditional< - ((ViewType::rank == 1) && !std::is_same_v) || - (ViewType::rank == 0), - PreferredLayoutType, typename ViewType::array_layout>::type; + using array_layout = + typename std::conditional<((ViewType::rank == 1) && + !std::is_same_v) || + (ViewType::rank == 0), + PreferredLayoutType, typename ViewType::array_layout>::type; }; template struct GetUnifiedLayout { - using array_layout = - typename GetUnifiedLayoutPreferring::array_layout; + using array_layout = typename GetUnifiedLayoutPreferring::array_layout; }; -template ::value> +template ::value> struct GetUnifiedScalarViewType { typedef typename TX::non_const_value_type type; }; template struct GetUnifiedScalarViewType { - typedef Kokkos::View::array_layout, - typename T::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View< + typename T::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename T::device_type, Kokkos::MemoryTraits > type; }; template struct GetUnifiedScalarViewType { - typedef Kokkos::View::array_layout, - typename T::device_type, - Kokkos::MemoryTraits > + typedef Kokkos::View< + typename T::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout, + typename T::device_type, Kokkos::MemoryTraits > type; }; template -struct are_integral : std::bool_constant<((std::is_integral_v || - std::is_enum_v)&&...)> {}; +struct are_integral : std::bool_constant<((std::is_integral_v || std::is_enum_v)&&...)> {}; template inline constexpr bool are_integral_v = are_integral::value; diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 415189be93..25089613d4 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -48,8 +48,7 @@ namespace { // anonymous /// /// Use intPowSigned or intPowUnsigned for general y. template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x, - const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x, const IntType y) { // Recursion (unrolled into while loop): pow(x, 2y) = (x^y)^2 IntType prod = x; IntType y_cur = 1; @@ -120,10 +119,8 @@ struct integer_abs { /// result of this function is undefined. However, this function will /// not throw an exception in that case. template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if::is_signed, IntType>::type +intPowSigned(const IntType x, const IntType y) { // It's not entirely clear what to return if x and y are both zero. // In the case of floating-point numbers, 0^0 is NaN. Here, though, // I think it's safe to return 0. @@ -143,10 +140,8 @@ KOKKOS_FORCEINLINE_FUNCTION return intPowImpl(x, y); } template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if::is_signed, IntType>::type +intPowSigned(const IntType x, const IntType y) { // It's not entirely clear what to return if x and y are both zero. // In the case of floating-point numbers, 0^0 is NaN. Here, though, // I think it's safe to return 0. @@ -166,8 +161,7 @@ KOKKOS_FORCEINLINE_FUNCTION /// result of this function is undefined. However, this function will /// not throw an exception in that case. template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, - const IntType y) { +KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, const IntType y) { // It's not entirely clear what to return if x and y are both zero. // In the case of floating-point numbers, 0^0 is NaN. Here, though, // I think it's safe to return 0. @@ -196,370 +190,229 @@ KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, namespace Kokkos { // Macro to automate the wrapping of Kokkos Mathematical Functions -#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ - static FUNC_QUAL val_type zero() { return static_cast(0); } \ - static FUNC_QUAL val_type one() { return static_cast(1); } \ - static FUNC_QUAL val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static FUNC_QUAL val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static FUNC_QUAL val_type infinity() { \ - return Kokkos::Experimental::infinity::value; \ - } \ - static FUNC_QUAL val_type nan() { \ - return Kokkos::Experimental::quiet_NaN::value; \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return Kokkos::Experimental::epsilon::value; \ - } \ - static FUNC_QUAL mag_type sfmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int base() { \ - return Kokkos::Experimental::radix::value; \ - } \ - static FUNC_QUAL mag_type prec() { \ - return epsilon() * static_cast(base()); \ - } \ - static FUNC_QUAL int t() { \ - return Kokkos::Experimental::digits::value; \ - } \ - static FUNC_QUAL mag_type rnd() { return one(); } \ - static FUNC_QUAL int emin() { \ - return Kokkos::Experimental::min_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int emax() { \ - return Kokkos::Experimental::max_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmax() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - \ - static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ - static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); } \ - static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ - static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ - static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ - static FUNC_QUAL val_type conj(const val_type x) { return x; } \ - static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ - static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ - static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ - static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ - static FUNC_QUAL val_type log10(const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ - static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ - static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ - static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ - static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ - static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ - static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ - static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ - static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ - \ - static FUNC_QUAL bool isnaninf(const val_type x) { \ - return isNan(x) || isInf(x); \ - } \ - static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } \ - static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ - static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ +#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ + static FUNC_QUAL val_type min() { return Kokkos::Experimental::finite_min::value; } \ + static FUNC_QUAL val_type max() { return Kokkos::Experimental::finite_max::value; } \ + static FUNC_QUAL val_type infinity() { return Kokkos::Experimental::infinity::value; } \ + static FUNC_QUAL val_type nan() { return Kokkos::Experimental::quiet_NaN::value; } \ + static FUNC_QUAL mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } \ + static FUNC_QUAL mag_type sfmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int base() { return Kokkos::Experimental::radix::value; } \ + static FUNC_QUAL mag_type prec() { return epsilon() * static_cast(base()); } \ + static FUNC_QUAL int t() { return Kokkos::Experimental::digits::value; } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { return Kokkos::Experimental::min_exponent::value; } \ + static FUNC_QUAL mag_type rmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int emax() { return Kokkos::Experimental::max_exponent::value; } \ + static FUNC_QUAL mag_type rmax() { return Kokkos::Experimental::finite_max::value; } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); } \ + static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { return Kokkos::log10(x); } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } // Macro to automate the wrapping of Kokkos Mathematical Functions -#define KOKKOSKERNELS_ARITHTRAITS_HALF_FP(FUNC_QUAL) \ - static FUNC_QUAL val_type zero() { return static_cast(0); } \ - static FUNC_QUAL val_type one() { return static_cast(1); } \ - static FUNC_QUAL val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static FUNC_QUAL val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static FUNC_QUAL val_type infinity() { \ - return Kokkos::Experimental::infinity::value; \ - } \ - static FUNC_QUAL val_type nan() { \ - return Kokkos::Experimental::quiet_NaN::value; \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return Kokkos::Experimental::epsilon::value; \ - } \ - static FUNC_QUAL mag_type sfmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int base() { \ - return Kokkos::Experimental::radix::value; \ - } \ - static FUNC_QUAL mag_type prec() { \ - return epsilon() * static_cast(base()); \ - } \ - static FUNC_QUAL int t() { \ - return Kokkos::Experimental::digits::value; \ - } \ - static FUNC_QUAL mag_type rnd() { return one(); } \ - static FUNC_QUAL int emin() { \ - return Kokkos::Experimental::min_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int emax() { \ - return Kokkos::Experimental::max_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmax() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - \ - static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ - static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ - static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ - static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ - static FUNC_QUAL val_type conj(const val_type x) { return x; } \ - static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ - static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ - static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ - static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ - static FUNC_QUAL val_type log10(const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ - static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ - static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ - static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ - static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ - static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ - static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ - static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ - static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ - \ - static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } \ - static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ - static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ +#define KOKKOSKERNELS_ARITHTRAITS_HALF_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ + static FUNC_QUAL val_type min() { return Kokkos::Experimental::finite_min::value; } \ + static FUNC_QUAL val_type max() { return Kokkos::Experimental::finite_max::value; } \ + static FUNC_QUAL val_type infinity() { return Kokkos::Experimental::infinity::value; } \ + static FUNC_QUAL val_type nan() { return Kokkos::Experimental::quiet_NaN::value; } \ + static FUNC_QUAL mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } \ + static FUNC_QUAL mag_type sfmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int base() { return Kokkos::Experimental::radix::value; } \ + static FUNC_QUAL mag_type prec() { return epsilon() * static_cast(base()); } \ + static FUNC_QUAL int t() { return Kokkos::Experimental::digits::value; } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { return Kokkos::Experimental::min_exponent::value; } \ + static FUNC_QUAL mag_type rmin() { return Kokkos::Experimental::norm_min::value; } \ + static FUNC_QUAL int emax() { return Kokkos::Experimental::max_exponent::value; } \ + static FUNC_QUAL mag_type rmax() { return Kokkos::Experimental::finite_max::value; } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { return Kokkos::log10(x); } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } -#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ - \ - static constexpr bool is_specialized = true; \ - static constexpr bool is_signed = true; \ - static constexpr bool is_integer = false; \ - static constexpr bool is_exact = false; \ - static constexpr bool is_complex = true; \ - static constexpr bool has_infinity = true; \ - \ - using magnitudeType = mag_type; \ - using halfPrecision = \ - ::Kokkos::complex::halfPrecision>; \ - using doublePrecision = \ - ::Kokkos::complex::doublePrecision>; \ - \ - static constexpr bool isComplex = true; \ - static constexpr bool isOrdinal = false; \ - static constexpr bool isComparable = false; \ - static constexpr bool hasMachineParameters = \ - ArithTraits::hasMachineParameters; \ - \ - static FUNC_QUAL val_type zero() { \ - return val_type(ArithTraits::zero(), \ - ArithTraits::zero()); \ - } \ - static FUNC_QUAL val_type one() { \ - return val_type(ArithTraits::one(), \ - ArithTraits::zero()); \ - } \ - static FUNC_QUAL val_type min() { \ - return val_type(ArithTraits::min(), \ - ArithTraits::min()); \ - } \ - static FUNC_QUAL val_type max() { \ - return val_type(ArithTraits::max(), \ - ArithTraits::max()); \ - } \ - static FUNC_QUAL val_type infinity() { \ - return val_type(ArithTraits::infinity(), \ - ArithTraits::infinity()); \ - } \ - static FUNC_QUAL val_type nan() { \ - return val_type(ArithTraits::nan(), \ - ArithTraits::nan()); \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return ArithTraits::epsilon(); \ - } \ - static FUNC_QUAL mag_type sfmin() { return ArithTraits::sfmin(); } \ - static FUNC_QUAL int base() { return ArithTraits::base(); } \ - static FUNC_QUAL mag_type prec() { return ArithTraits::prec(); } \ - static FUNC_QUAL int t() { return ArithTraits::t(); } \ - static FUNC_QUAL mag_type rnd() { return ArithTraits::rnd(); } \ - static FUNC_QUAL int emin() { return ArithTraits::emin(); } \ - static FUNC_QUAL mag_type rmin() { return ArithTraits::rmin(); } \ - static FUNC_QUAL int emax() { return ArithTraits::emax(); } \ - static FUNC_QUAL mag_type rmax() { return ArithTraits::rmax(); } \ - static FUNC_QUAL bool isInf(const val_type x) { \ - return ArithTraits::isInf(x.real()) || \ - ArithTraits::isInf(x.imag()); \ - } \ - static FUNC_QUAL bool isNan(const val_type x) { \ - return ArithTraits::isNan(x.real()) || \ - ArithTraits::isNan(x.imag()); \ - } \ - static FUNC_QUAL mag_type abs(const val_type x) { return ::Kokkos::abs(x); } \ - static FUNC_QUAL mag_type real(const val_type x) { return x.real(); } \ - static FUNC_QUAL mag_type imag(const val_type x) { return x.imag(); } \ - static FUNC_QUAL val_type conj(const val_type x) { \ - return ::Kokkos::conj(x); \ - } \ - static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type pow(const val_type x, const mag_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type pow(const mag_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { \ - return ::Kokkos::sqrt(x); \ - } \ - static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ - static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ - static FUNC_QUAL val_type log10(const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ - static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ - static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ - static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ - static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ - static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ - static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ - static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ - static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ - static FUNC_QUAL bool isnaninf(const val_type& x) { \ - return isNan(x) || isInf(x); \ - } \ - static FUNC_QUAL mag_type magnitude(const val_type x) { return abs(x); } \ - static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ - static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ +#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_signed = true; \ + static constexpr bool is_integer = false; \ + static constexpr bool is_exact = false; \ + static constexpr bool is_complex = true; \ + static constexpr bool has_infinity = true; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = ::Kokkos::complex::halfPrecision>; \ + using doublePrecision = ::Kokkos::complex::doublePrecision>; \ + \ + static constexpr bool isComplex = true; \ + static constexpr bool isOrdinal = false; \ + static constexpr bool isComparable = false; \ + static constexpr bool hasMachineParameters = ArithTraits::hasMachineParameters; \ + \ + static FUNC_QUAL val_type zero() { return val_type(ArithTraits::zero(), ArithTraits::zero()); } \ + static FUNC_QUAL val_type one() { return val_type(ArithTraits::one(), ArithTraits::zero()); } \ + static FUNC_QUAL val_type min() { return val_type(ArithTraits::min(), ArithTraits::min()); } \ + static FUNC_QUAL val_type max() { return val_type(ArithTraits::max(), ArithTraits::max()); } \ + static FUNC_QUAL val_type infinity() { \ + return val_type(ArithTraits::infinity(), ArithTraits::infinity()); \ + } \ + static FUNC_QUAL val_type nan() { return val_type(ArithTraits::nan(), ArithTraits::nan()); } \ + static FUNC_QUAL mag_type epsilon() { return ArithTraits::epsilon(); } \ + static FUNC_QUAL mag_type sfmin() { return ArithTraits::sfmin(); } \ + static FUNC_QUAL int base() { return ArithTraits::base(); } \ + static FUNC_QUAL mag_type prec() { return ArithTraits::prec(); } \ + static FUNC_QUAL int t() { return ArithTraits::t(); } \ + static FUNC_QUAL mag_type rnd() { return ArithTraits::rnd(); } \ + static FUNC_QUAL int emin() { return ArithTraits::emin(); } \ + static FUNC_QUAL mag_type rmin() { return ArithTraits::rmin(); } \ + static FUNC_QUAL int emax() { return ArithTraits::emax(); } \ + static FUNC_QUAL mag_type rmax() { return ArithTraits::rmax(); } \ + static FUNC_QUAL bool isInf(const val_type x) { \ + return ArithTraits::isInf(x.real()) || ArithTraits::isInf(x.imag()); \ + } \ + static FUNC_QUAL bool isNan(const val_type x) { \ + return ArithTraits::isNan(x.real()) || ArithTraits::isNan(x.imag()); \ + } \ + static FUNC_QUAL mag_type abs(const val_type x) { return ::Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return x.real(); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return x.imag(); } \ + static FUNC_QUAL val_type conj(const val_type x) { return ::Kokkos::conj(x); } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type pow(const val_type x, const mag_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type pow(const mag_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return ::Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { return Kokkos::log10(x); } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + static FUNC_QUAL bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } \ + static FUNC_QUAL mag_type magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsAbs(const val_type x) { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsAbs(const val_type x) { return Kokkos::abs(x); } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsAbs(const val_type x) { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsAbs(const val_type x) { return x; } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsNan() { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsNan() { return -1; } template -static KOKKOS_FUNCTION - typename std::enable_if::is_signed, - val_type>::type - KokkosKernelsNan() { +static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type +KokkosKernelsNan() { return Kokkos::Experimental::finite_max::value; } -#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ - \ - static constexpr bool is_specialized = true; \ - static constexpr bool is_integer = true; \ - static constexpr bool is_exact = true; \ - static constexpr bool is_complex = false; \ - static constexpr bool has_infinity = false; \ - \ - using magnitudeType = mag_type; \ - using halfPrecision = val_type; \ - using doublePrecision = val_type; \ - \ - static constexpr bool isComplex = false; \ - static constexpr bool isOrdinal = true; \ - static constexpr bool isComparable = true; \ - static constexpr bool hasMachineParameters = false; \ - \ - static KOKKOS_FUNCTION val_type zero() { return static_cast(0); } \ - static KOKKOS_FUNCTION val_type one() { return static_cast(1); } \ - static KOKKOS_FUNCTION val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static KOKKOS_FUNCTION val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static KOKKOS_FUNCTION val_type infinity() { \ - return static_cast(0); \ - } \ - static KOKKOS_FUNCTION val_type nan() { \ - return KokkosKernelsNan(); \ - } \ - static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ - static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ - static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ - return KokkosKernelsAbs(x); \ - } \ - static KOKKOS_FUNCTION mag_type real(const val_type x) { \ - return Kokkos::real(x); \ - } \ - static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ - static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ - static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static KOKKOS_FUNCTION val_type sqrt(const val_type x) { \ - return static_cast(Kokkos::sqrt(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type cbrt(const val_type x) { \ - return static_cast(Kokkos::cbrt(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type exp(const val_type x) { \ - return static_cast(Kokkos::exp(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type log(const val_type x) { \ - return static_cast(Kokkos::log(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type log10(const val_type x) { \ - return static_cast(Kokkos::log10(abs(x))); \ - } \ - static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ - static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } \ - static KOKKOS_FUNCTION val_type conjugate(const val_type x) { \ - return conj(x); \ - } \ - static KOKKOS_FUNCTION bool isnaninf(const val_type) { return false; } \ - static KOKKOS_FUNCTION val_type squareroot(const val_type x) { \ - return sqrt(x); \ - } +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = true; \ + static constexpr bool is_exact = true; \ + static constexpr bool is_complex = false; \ + static constexpr bool has_infinity = false; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = val_type; \ + using doublePrecision = val_type; \ + \ + static constexpr bool isComplex = false; \ + static constexpr bool isOrdinal = true; \ + static constexpr bool isComparable = true; \ + static constexpr bool hasMachineParameters = false; \ + \ + static KOKKOS_FUNCTION val_type zero() { return static_cast(0); } \ + static KOKKOS_FUNCTION val_type one() { return static_cast(1); } \ + static KOKKOS_FUNCTION val_type min() { return Kokkos::Experimental::finite_min::value; } \ + static KOKKOS_FUNCTION val_type max() { return Kokkos::Experimental::finite_max::value; } \ + static KOKKOS_FUNCTION val_type infinity() { return static_cast(0); } \ + static KOKKOS_FUNCTION val_type nan() { return KokkosKernelsNan(); } \ + static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ + static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { return KokkosKernelsAbs(x); } \ + static KOKKOS_FUNCTION mag_type real(const val_type x) { return Kokkos::real(x); } \ + static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } \ + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { return static_cast(Kokkos::sqrt(abs(x))); } \ + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { return static_cast(Kokkos::cbrt(abs(x))); } \ + static KOKKOS_FUNCTION val_type exp(const val_type x) { return static_cast(Kokkos::exp(abs(x))); } \ + static KOKKOS_FUNCTION val_type log(const val_type x) { return static_cast(Kokkos::log(abs(x))); } \ + static KOKKOS_FUNCTION val_type log10(const val_type x) { return static_cast(Kokkos::log10(abs(x))); } \ + static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } \ + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } \ + static KOKKOS_FUNCTION bool isnaninf(const val_type) { return false; } \ + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } /// \class ArithTraits /// \brief Traits class for arithmetic on type T. @@ -1103,11 +956,9 @@ class ArithTraits { using magnitudeType = mag_type; using halfPrecision = float; #if defined(__CUDA_ARCH__) - using doublePrecision = - double; // CUDA doesn't support long double, unfortunately + using doublePrecision = double; // CUDA doesn't support long double, unfortunately #elif defined(__HIP_DEVICE_COMPILE__) - using doublePrecision = - double; // HIP does not support long double unfortunately + using doublePrecision = double; // HIP does not support long double unfortunately #else using doublePrecision = long double; #endif // __CUDA_ARCH__ @@ -1230,8 +1081,7 @@ class ArithTraits > { static constexpr bool has_infinity = true; static std::complex infinity() { - return std::complex(ArithTraits::infinity(), - ArithTraits::infinity()); + return std::complex(ArithTraits::infinity(), ArithTraits::infinity()); } #ifdef KOKKOS_ENABLE_SYCL @@ -1280,37 +1130,23 @@ class ArithTraits > { return isnan(real(x)) || isnan(imag(x)); } #endif - static mag_type abs(const std::complex& x) { - return std::abs(x); - } + static mag_type abs(const std::complex& x) { return std::abs(x); } static std::complex zero() { - return std::complex(ArithTraits::zero(), - ArithTraits::zero()); + return std::complex(ArithTraits::zero(), ArithTraits::zero()); } static std::complex one() { - return std::complex(ArithTraits::one(), - ArithTraits::zero()); + return std::complex(ArithTraits::one(), ArithTraits::zero()); } static std::complex min() { - return std::complex(ArithTraits::min(), - ArithTraits::zero()); + return std::complex(ArithTraits::min(), ArithTraits::zero()); } static std::complex max() { - return std::complex(ArithTraits::max(), - ArithTraits::zero()); - } - static mag_type real(const std::complex& x) { - return std::real(x); - } - static mag_type imag(const std::complex& x) { - return std::imag(x); + return std::complex(ArithTraits::max(), ArithTraits::zero()); } - static std::complex conj( - const std::complex& x) { - return std::conj(x); - } - static std::complex pow(const std::complex& x, - const std::complex& y) { + static mag_type real(const std::complex& x) { return std::real(x); } + static mag_type imag(const std::complex& x) { return std::imag(x); } + static std::complex conj(const std::complex& x) { return std::conj(x); } + static std::complex pow(const std::complex& x, const std::complex& y) { // Fix for some weird gcc 4.2.1 inaccuracy. if (y == one()) { return x; @@ -1320,46 +1156,29 @@ class ArithTraits > { return std::pow(x, y); } } - static std::complex pow(const std::complex& x, - const RealFloatType& y) { + static std::complex pow(const std::complex& x, const RealFloatType& y) { // Fix for some weird gcc 4.2.1 inaccuracy. if (y == ArithTraits::one()) { return x; - } else if (y == ArithTraits::one() + - ArithTraits::one()) { + } else if (y == ArithTraits::one() + ArithTraits::one()) { return x * x; } else { return std::pow(x, y); } } - static std::complex sqrt( - const std::complex& x) { - return std::sqrt(x); - } - static std::complex cbrt( - const std::complex& x) { + static std::complex sqrt(const std::complex& x) { return std::sqrt(x); } + static std::complex cbrt(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::cbrt(x); #else return ::cbrt(x); #endif } - static std::complex exp(const std::complex& x) { - return std::exp(x); - } - static std::complex log(const std::complex& x) { - return std::log(x); - } - static std::complex log10( - const std::complex& x) { - return std::log10(x); - } - static std::complex sin(const std::complex& x) { - return std::sin(x); - } - static std::complex cos(const std::complex& x) { - return std::cos(x); - } + static std::complex exp(const std::complex& x) { return std::exp(x); } + static std::complex log(const std::complex& x) { return std::log(x); } + static std::complex log10(const std::complex& x) { return std::log10(x); } + static std::complex sin(const std::complex& x) { return std::sin(x); } + static std::complex cos(const std::complex& x) { return std::cos(x); } static std::complex tan(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::tan(x); @@ -1367,36 +1186,24 @@ class ArithTraits > { return std::tan(x); #endif } - static std::complex sinh( - const std::complex& x) { - return std::sinh(x); - } - static std::complex cosh( - const std::complex& x) { - return std::cosh(x); - } - static std::complex tanh( - const std::complex& x) { - return std::tanh(x); - } - static std::complex asin( - const std::complex& x) { + static std::complex sinh(const std::complex& x) { return std::sinh(x); } + static std::complex cosh(const std::complex& x) { return std::cosh(x); } + static std::complex tanh(const std::complex& x) { return std::tanh(x); } + static std::complex asin(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::asin(x); #else return ::asin(x); #endif } - static std::complex acos( - const std::complex& x) { + static std::complex acos(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::acos(x); #else return ::acos(x); #endif } - static std::complex atan( - const std::complex& x) { + static std::complex atan(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL using sycl::atan; #else @@ -1411,33 +1218,19 @@ class ArithTraits > { static mag_type epsilon() { return ArithTraits::epsilon(); } // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = - std::complex::halfPrecision>; - using doublePrecision = - std::complex::doublePrecision>; + using magnitudeType = mag_type; + using halfPrecision = std::complex::halfPrecision>; + using doublePrecision = std::complex::doublePrecision>; static constexpr bool isComplex = true; static constexpr bool isOrdinal = false; static constexpr bool isComparable = false; static constexpr bool hasMachineParameters = true; - static bool isnaninf(const std::complex& x) { - return isNan(x) || isInf(x); - } - static mag_type magnitude(const std::complex& x) { - return abs(x); - } - static std::complex conjugate( - const std::complex& x) { - return conj(x); - } - static std::string name() { - return std::string("std::complex<") + ArithTraits::name() + ">"; - } - static std::complex squareroot( - const std::complex& x) { - return sqrt(x); - } + static bool isnaninf(const std::complex& x) { return isNan(x) || isInf(x); } + static mag_type magnitude(const std::complex& x) { return abs(x); } + static std::complex conjugate(const std::complex& x) { return conj(x); } + static std::string name() { return std::string("std::complex<") + ArithTraits::name() + ">"; } + static std::complex squareroot(const std::complex& x) { return sqrt(x); } static mag_type eps() { return epsilon(); } static mag_type sfmin() { return ArithTraits::sfmin(); } static int base() { return ArithTraits::base(); } @@ -1637,9 +1430,7 @@ struct [[deprecated]] ArithTraits { static inline mag_type real(const val_type& x) { return x; } static inline mag_type imag(const val_type&) { return zero(); } static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } + static inline val_type pow(const val_type& x, const val_type& y) { return ::pow(x, y); } static inline val_type sqrt(const val_type& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::sqrt(x); @@ -1710,11 +1501,7 @@ struct [[deprecated]] ArithTraits { static int base() { return std::numeric_limits::radix; } static mag_type prec() { return eps() * base(); } static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } + static mag_type rnd() { return std::numeric_limits::round_style == std::round_to_nearest ? one() : zero(); } static int emin() { return std::numeric_limits::min_exponent; } static mag_type rmin() { return std::numeric_limits::min(); } static int emax() { return std::numeric_limits::max_exponent; } @@ -1753,9 +1540,7 @@ struct [[deprecated]] ArithTraits { static inline mag_type real(const val_type& x) { return x; } static inline mag_type imag(const val_type&) { return zero(); } static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } + static inline val_type pow(const val_type& x, const val_type& y) { return ::pow(x, y); } static inline val_type sqrt(const val_type& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::sqrt(x); @@ -1810,9 +1595,7 @@ struct [[deprecated]] ArithTraits { #endif } static inline val_type nan() { return val_type::_nan; } - static inline val_type epsilon() { - return std::numeric_limits::epsilon(); - } + static inline val_type epsilon() { return std::numeric_limits::epsilon(); } typedef qd_real magnitudeType; typedef dd_real halfPrecision; @@ -1832,11 +1615,7 @@ struct [[deprecated]] ArithTraits { static int base() { return std::numeric_limits::radix; } static mag_type prec() { return eps() * base(); } static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } + static mag_type rnd() { return std::numeric_limits::round_style == std::round_to_nearest ? one() : zero(); } static int emin() { return std::numeric_limits::min_exponent; } static mag_type rmin() { return std::numeric_limits::min(); } static int emax() { return std::numeric_limits::max_exponent; } @@ -1857,8 +1636,7 @@ struct [[deprecated]] ArithTraits { namespace Details { template -using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = - ::Kokkos::ArithTraits; +using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = ::Kokkos::ArithTraits; } // namespace Details } // namespace Kokkos diff --git a/common/src/Kokkos_InnerProductSpaceTraits.hpp b/common/src/Kokkos_InnerProductSpaceTraits.hpp index c2bc475c45..25337c925f 100644 --- a/common/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/common/src/Kokkos_InnerProductSpaceTraits.hpp @@ -125,19 +125,14 @@ class InnerProductSpaceTraits { typedef val_type dot_type; //! The "norm" (absolute value or magnitude) of a value x of type val_type. - static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } /// \brief The "dot product" of two values x and y of type val_type. /// /// This default implementation should suffice unless val_type is /// complex. In that case, see the partial specialization for /// Kokkos::complex below to see our convention for which input gets /// conjugated. - static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, - const val_type& y) { - return x * y; - } + static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; /// \brief Partial specialization for long double. @@ -149,9 +144,7 @@ struct InnerProductSpaceTraits { typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -163,13 +156,8 @@ class InnerProductSpaceTraits> { typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, - const val_type& y) { - return Kokkos::conj(x) * y; - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } + static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, const val_type& y) { return Kokkos::conj(x) * y; } }; /// \brief Partial specialization for std::complex. @@ -182,12 +170,8 @@ struct InnerProductSpaceTraits> { typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } - static dot_type dot(const val_type& x, const val_type& y) { - return std::conj(x) * y; - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } + static dot_type dot(const val_type& x, const val_type& y) { return std::conj(x) * y; } }; #ifdef HAVE_KOKKOSKERNELS_QUADMATH @@ -203,9 +187,7 @@ struct InnerProductSpaceTraits<__float128> { typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -232,9 +214,7 @@ struct InnerProductSpaceTraits { typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -244,34 +224,24 @@ struct InnerProductSpaceTraits { typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; - static mag_type norm(const val_type& x) { - return Kokkos::ArithTraits::abs(x); - } + static mag_type norm(const val_type& x) { return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; #endif // HAVE_KOKKOS_QD template -KOKKOS_INLINE_FUNCTION void updateDot(ResultType& sum, const InputType1& x, - const InputType2& y) { +KOKKOS_INLINE_FUNCTION void updateDot(ResultType& sum, const InputType1& x, const InputType2& y) { // FIXME (mfh 22 Jan 2020) We should actually pick the type with the // greater precision. sum += InnerProductSpaceTraits::dot(x, y); } -KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const double x, - const double y) { - sum += x * y; -} +KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const double x, const double y) { sum += x * y; } -KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, - const float y) { - sum += x * y; -} +KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, const float y) { sum += x * y; } // This exists because complex += complex is not defined. -KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, - const Kokkos::complex x, +KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, const Kokkos::complex x, const Kokkos::complex y) { const auto tmp = Kokkos::conj(x) * y; sum += Kokkos::complex(tmp.real(), tmp.imag()); @@ -280,8 +250,7 @@ KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, // This exists in case people call the overload of KokkosBlas::dot // that takes an output View, and the output View has element type // Kokkos::complex. -KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, - const Kokkos::complex x, +KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, const Kokkos::complex x, const Kokkos::complex y) { sum += Kokkos::conj(x) * y; } diff --git a/common/unit_test/Test_Common_AlignPtrTo.hpp b/common/unit_test/Test_Common_AlignPtrTo.hpp index 760cddd5a2..33e7ed542c 100644 --- a/common/unit_test/Test_Common_AlignPtrTo.hpp +++ b/common/unit_test/Test_Common_AlignPtrTo.hpp @@ -60,16 +60,14 @@ KOKKOS_INLINE_FUNCTION T *f1(InPtr p) { template KOKKOS_INLINE_FUNCTION T *f2(InPtr p) { std::uintptr_t ptrVal = reinterpret_cast(p); - return reinterpret_cast((ptrVal + alignof(T) - 1) / alignof(T) * - alignof(T)); + return reinterpret_cast((ptrVal + alignof(T) - 1) / alignof(T) * alignof(T)); } // the way GCC does it (roughly) template KOKKOS_INLINE_FUNCTION T *f3(InPtr p) { std::uintptr_t ptrVal = reinterpret_cast(p); - return reinterpret_cast((ptrVal - uint64_t(1) + alignof(T)) & - -alignof(T)); + return reinterpret_cast((ptrVal - uint64_t(1) + alignof(T)) & -alignof(T)); } // Function to be executed by each team @@ -81,8 +79,7 @@ struct TeamFunction { template KOKKOS_INLINE_FUNCTION void operator()(const Team &team) const { // get an "aligned" pointer to scratch memory - char *shmem = (char *)(team.team_shmem().get_shmem(team.team_size() * - sizeof(double))); + char *shmem = (char *)(team.team_shmem().get_shmem(team.team_size() * sizeof(double))); double *vals; if constexpr (0 == TEST_FN) { vals = f0(shmem); @@ -109,9 +106,7 @@ struct TeamFunction { results_(i) = vals[i]; } - size_t team_shmem_size(int team_size) const { - return team_size * sizeof(double); - } + size_t team_shmem_size(int team_size) const { return team_size * sizeof(double); } Results results_; }; @@ -119,20 +114,18 @@ struct TeamFunction { // use atomic add to set result(i) = i template void test_alignPtrTo() { - using MemorySpace = typename Device::memory_space; - using ExecSpace = typename Device::execution_space; - using TestView = Kokkos::View; - using TestPolicy = Kokkos::TeamPolicy; - const int teamSize = TestPolicy(1, Kokkos::AUTO) - .team_size_max(TeamFunction(), - Kokkos::ParallelForTag{}); + using MemorySpace = typename Device::memory_space; + using ExecSpace = typename Device::execution_space; + using TestView = Kokkos::View; + using TestPolicy = Kokkos::TeamPolicy; + const int teamSize = + TestPolicy(1, Kokkos::AUTO).team_size_max(TeamFunction(), Kokkos::ParallelForTag{}); ExecSpace space; TestView results("TestView", teamSize); TestPolicy policy(space, 1, teamSize); - Kokkos::parallel_for("test alignment", policy, - TeamFunction(results)); + Kokkos::parallel_for("test alignment", policy, TeamFunction(results)); int errs; Kokkos::parallel_reduce( diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 8c493a3666..73a4ebfefe 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -42,8 +42,7 @@ } #if 0 -#define TRACE() \ - Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); +#define TRACE() Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); #else #define TRACE() #endif @@ -133,8 +132,7 @@ class ArithTraitsTesterBase { /// \brief Combine two intermediate reduction results into \c dst. /// /// Subclasses need not and must not override this method. - KOKKOS_INLINE_FUNCTION void join(value_type& dst, - const value_type& src) const { + KOKKOS_INLINE_FUNCTION void join(value_type& dst, const value_type& src) const { dst = dst && src; // dst = 1; } @@ -157,8 +155,7 @@ class ArithTraitsTesterBase { /// far. On output: The result of the tests run in this method. /// The result of more than one test is the logical AND of each /// test's result. - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // not using this argument @@ -293,14 +290,12 @@ class ArithTraitsTesterBase { } if (AT::is_integer != std::numeric_limits::is_integer) { - out << "AT::is_integer != std::numeric_limits::is_integer" - << endl; + out << "AT::is_integer != std::numeric_limits::is_integer" << endl; FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { - out << "AT::is_exact != std::numeric_limits::is_exact" - << endl; + out << "AT::is_exact != std::numeric_limits::is_exact" << endl; FAILURE(); } @@ -354,11 +349,9 @@ class ArithTraitsTesterBase { if (AT::has_infinity) { // Compiler intrinsic casts from inf of type half_t / bhalf_t to inf // of type float in CUDA, SYCL and HIP do not work yet. -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) namespace KE = Kokkos::Experimental; - if constexpr (!std::is_same::value && - !std::is_same::value) { + if constexpr (!std::is_same::value && !std::is_same::value) { #else { #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP @@ -396,10 +389,8 @@ class ArithTraitsTesterBase { /// implements transcendental functions, but the specific tests that /// are run will depend on \c ScalarType. template ::value ? 1 : 0)> -class ArithTraitsTesterTranscendentalBase - : public ArithTraitsTesterBase { + const int has_transcendentals = (HasTranscendentals::value ? 1 : 0)> +class ArithTraitsTesterTranscendentalBase : public ArithTraitsTesterBase { private: //! The base class of this class. typedef ArithTraitsTesterBase base_type; @@ -413,8 +404,7 @@ class ArithTraitsTesterTranscendentalBase /// \brief The "parallel for" part of the reduction. /// /// See comments of ArithTraitsTesterBase's operator(). - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const; + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const; //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterTranscendentalBase(); @@ -445,8 +435,7 @@ class ArithTraitsTesterTranscendentalBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterTranscendentalBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); // typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -524,8 +513,7 @@ class ArithTraitsTesterTranscendentalBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterTranscendentalBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -666,8 +654,7 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { - Kokkos::printf( - "AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); + Kokkos::printf("AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); FAILURE(); } } @@ -685,13 +672,11 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { - Kokkos::printf( - "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); + Kokkos::printf("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { - Kokkos::printf( - "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); + Kokkos::printf("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); FAILURE(); } } else { @@ -788,8 +773,7 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(three, three); if (result != twentySeven) { - out << "AT::pow (three, three) = " << result - << " != twentySeven = " << twentySeven << endl; + out << "AT::pow (three, three) = " << result << " != twentySeven = " << twentySeven << endl; FAILURE(); } } @@ -798,20 +782,17 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_signed && !AT::is_complex) { result = AT::pow(-three, one); if (result != -three) { - out << "AT::pow (-three, one) = " << result << " != -three = " << -three - << endl; + out << "AT::pow (-three, one) = " << result << " != -three = " << -three << endl; FAILURE(); } result = AT::pow(-three, two); if (result != nine) { - out << "AT::pow (-three, two) = " << result << " != nine = " << nine - << endl; + out << "AT::pow (-three, two) = " << result << " != nine = " << nine << endl; FAILURE(); } result = AT::pow(-three, three); if (result != -twentySeven) { - out << "AT::pow (-three, three) = " << result - << " != -twentySeven = " << twentySeven << endl; + out << "AT::pow (-three, three) = " << result << " != -twentySeven = " << twentySeven << endl; FAILURE(); } } @@ -877,8 +858,7 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { - Kokkos::printf( - "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); + Kokkos::printf("AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); FAILURE(); } } @@ -896,13 +876,11 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { - Kokkos::printf( - "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); + Kokkos::printf("AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { - Kokkos::printf( - "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); + Kokkos::printf("AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); FAILURE(); } } else { @@ -956,10 +934,8 @@ class ArithTraitsTesterTranscendentalBase /// Some tests will be executed whether or not ScalarType is /// complex, but the specific tests that are run will depend on /// ScalarType. -template ::is_complex> -class ArithTraitsTesterComplexBase - : public ArithTraitsTesterTranscendentalBase { +template ::is_complex> +class ArithTraitsTesterComplexBase : public ArithTraitsTesterTranscendentalBase { private: //! The base class of this class. typedef ArithTraitsTesterTranscendentalBase base_type; @@ -973,8 +949,7 @@ class ArithTraitsTesterComplexBase /// \brief The "parallel for" part of the reduction. /// /// See comments of ArithTraitsTesterBase's operator(). - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const; + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const; //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterComplexBase(); @@ -1004,8 +979,7 @@ class ArithTraitsTesterComplexBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterComplexBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1029,9 +1003,9 @@ class ArithTraitsTesterComplexBase } #endif // KOKKOS_HALF_T_IS_FLOAT - if (AT::is_complex) { - FAILURE(); - } + if (AT::is_complex) { + FAILURE(); + } // Call the base class' implementation. Every subclass' // implementation of operator() must do this, in order to include @@ -1090,8 +1064,7 @@ class ArithTraitsTesterComplexBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterComplexBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1109,8 +1082,7 @@ class ArithTraitsTesterComplexBase const ScalarType onePlusOne(one, one); // Test conjugation. - if (AT::conj(oneMinusOne) != onePlusOne || - AT::conj(onePlusOne) != oneMinusOne) { + if (AT::conj(oneMinusOne) != onePlusOne || AT::conj(onePlusOne) != oneMinusOne) { FAILURE(); } @@ -1178,16 +1150,12 @@ class ArithTraitsTesterComplexBase /// (testHost()). The device-based test is a reduction over redundant /// executions of the test. All redundant executions must return /// '1' (passed). -template ::is_exact> +template ::is_exact> class ArithTraitsTesterFloatingPointBase - : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { + : public ArithTraitsTesterComplexBase::is_complex> { private: //! The base class of this class. - typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> - base_type; + typedef ArithTraitsTesterComplexBase::is_complex> base_type; public: typedef DeviceType execution_space; @@ -1198,8 +1166,7 @@ class ArithTraitsTesterFloatingPointBase /// \brief The "parallel for" part of the reduction. /// /// See comments of ArithTraitsTesterBase's operator(). - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const; + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const; protected: virtual int testHostImpl(std::ostream& out) const; @@ -1211,13 +1178,10 @@ class ArithTraitsTesterFloatingPointBase // template class ArithTraitsTesterFloatingPointBase - : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { + : public ArithTraitsTesterComplexBase::is_complex> { private: //! The base class of this class. - typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> - base_type; + typedef ArithTraitsTesterComplexBase::is_complex> base_type; public: typedef typename DeviceType::execution_space execution_space; @@ -1228,8 +1192,7 @@ class ArithTraitsTesterFloatingPointBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterFloatingPointBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1242,11 +1205,9 @@ class ArithTraitsTesterFloatingPointBase // Compiler intrinsic casts from nan of type half_t / bhalf_t to nan // of type float in CUDA, SYCL and HIP do not work yet. -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) namespace KE = Kokkos::Experimental; - if constexpr (!std::is_same::value && - !std::is_same::value) { + if constexpr (!std::is_same::value && !std::is_same::value) { #else { #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP @@ -1267,8 +1228,7 @@ class ArithTraitsTesterFloatingPointBase Kokkos::printf("1 is Inf\n"); FAILURE(); } -#if defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP if constexpr (!std::is_same_v) { if (AT::isNan(zero)) { Kokkos::printf("0 is NaN\n"); @@ -1377,13 +1337,10 @@ class ArithTraitsTesterFloatingPointBase // template class ArithTraitsTesterFloatingPointBase - : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { + : public ArithTraitsTesterComplexBase::is_complex> { private: //! The base class of this class. - typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> - base_type; + typedef ArithTraitsTesterComplexBase::is_complex> base_type; public: typedef typename DeviceType::execution_space execution_space; @@ -1394,8 +1351,7 @@ class ArithTraitsTesterFloatingPointBase //! Constructor (does nothing, but marked as device function). KOKKOS_INLINE_FUNCTION ArithTraitsTesterFloatingPointBase() {} - KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, - value_type& dst) const { + KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable @@ -1464,8 +1420,7 @@ class ArithTraitsTesterFloatingPointBase /// executions of the test. All redundant executions must return /// '1' (passed). template -class ArithTraitsTester - : public ArithTraitsTesterFloatingPointBase { +class ArithTraitsTester : public ArithTraitsTesterFloatingPointBase { public: typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; @@ -1491,11 +1446,9 @@ int testArithTraitsOnDevice(std::ostream& out, const int verbose) { using std::endl; typedef ArithTraitsTester functor_type; int success = 1; // output argument of parallel_reduce - Kokkos::parallel_reduce("KokkosKernels::Common::Test::ArithTraitsOnDevice", 1, - functor_type(), success); + Kokkos::parallel_reduce("KokkosKernels::Common::Test::ArithTraitsOnDevice", 1, functor_type(), success); if (success) { - if (verbose) - out << Kokkos::ArithTraits::name() << " passed" << endl; + if (verbose) out << Kokkos::ArithTraits::name() << " passed" << endl; } else { out << Kokkos::ArithTraits::name() << " FAILED" << endl; } @@ -1517,8 +1470,7 @@ int testArithTraitsOnHost(std::ostream& out, const int verbose) { const int localSuccess = f.testHost(out); if (localSuccess) { - if (verbose) - out << Kokkos::ArithTraits::name() << " passed" << endl; + if (verbose) out << Kokkos::ArithTraits::name() << " passed" << endl; } else { out << Kokkos::ArithTraits::name() << " FAILED" << endl; } @@ -1558,8 +1510,7 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice(out, verbose); + curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; @@ -1587,8 +1538,7 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice(out, verbose); + curSuccess = testArithTraitsOnDevice(out, verbose); // // Built-in real floating-point types @@ -1596,10 +1546,8 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { #if defined(KOKKOS_HALF_T_IS_FLOAT) TRACE(); - success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice( - out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnDevice(out, verbose); #endif // KOKKOS_HALF_T_IS_FLOAT success = success && curSuccess; curSuccess = testArithTraitsOnDevice(out, verbose); @@ -1610,12 +1558,10 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { // Kokkos' complex floating-point types // - success = success && curSuccess; - curSuccess = - testArithTraitsOnDevice, DeviceType>(out, verbose); success = success && curSuccess; - curSuccess = testArithTraitsOnDevice, DeviceType>( - out, verbose); + curSuccess = testArithTraitsOnDevice, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnDevice, DeviceType>(out, verbose); return success && curSuccess; } @@ -1682,8 +1628,7 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { success = success && curSuccess; curSuccess = testArithTraitsOnHost(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnHost(out, verbose); + curSuccess = testArithTraitsOnHost(out, verbose); // // Built-in real and complex floating-point types @@ -1693,20 +1638,16 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { curSuccess = testArithTraitsOnHost(out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost(out, verbose); -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = testArithTraitsOnHost(out, verbose); success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); - success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); success = success && curSuccess; - curSuccess = testArithTraitsOnHost, DeviceType>( - out, verbose); + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); #endif // // Kokkos' complex floating-point types @@ -1715,15 +1656,12 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { #if defined(KOKKOS_HALF_T_IS_FLOAT) success = success && curSuccess; TRACE(); - curSuccess = testArithTraitsOnHost( - out, verbose); + curSuccess = testArithTraitsOnHost(out, verbose); #endif // KOKKOS_HALF_T_IS_FLOAT - success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); - success = success && curSuccess; - curSuccess = - testArithTraitsOnHost, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); + success = success && curSuccess; + curSuccess = testArithTraitsOnHost, DeviceType>(out, verbose); // success = success && curSuccess; curSuccess = // testArithTraitsOnHost, DeviceType> (out, // verbose); diff --git a/common/unit_test/Test_Common_Error.hpp b/common/unit_test/Test_Common_Error.hpp index 375f75b5ff..139231d63f 100644 --- a/common/unit_test/Test_Common_Error.hpp +++ b/common/unit_test/Test_Common_Error.hpp @@ -20,8 +20,7 @@ #include "KokkosKernels_Error.hpp" void test_kokkoskernels_throw() { - const std::string my_throw_msg = - "Testing Kokkos Kernels' throw_runtime_exception."; + const std::string my_throw_msg = "Testing Kokkos Kernels' throw_runtime_exception."; try { KokkosKernels::Impl::throw_runtime_exception(my_throw_msg); } catch (const std::runtime_error& e) { diff --git a/common/unit_test/Test_Common_Iota.hpp b/common/unit_test/Test_Common_Iota.hpp index af3b6502bf..ee1e33fda8 100644 --- a/common/unit_test/Test_Common_Iota.hpp +++ b/common/unit_test/Test_Common_Iota.hpp @@ -76,13 +76,11 @@ void test_iota_rank() { template void test_iota_non_const_value_type() { - static_assert( - std::is_same_v::non_const_value_type, T>, - "Iota's non-const value type should be same as non-const type provided"); - static_assert( - std::is_same_v::non_const_value_type, T>, - "Iota's non-const value type should be same as non-const version of " - "const type provided"); + static_assert(std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const type provided"); + static_assert(std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const version of " + "const type provided"); } template @@ -98,10 +96,8 @@ void test_iota_subview() { template void test_is_iota() { - static_assert(KokkosKernels::Impl::is_iota_v>, - "Iota should be an Iota"); - static_assert(!KokkosKernels::Impl::is_iota_v, - "int should not be an Iota"); + static_assert(KokkosKernels::Impl::is_iota_v>, "Iota should be an Iota"); + static_assert(!KokkosKernels::Impl::is_iota_v, "int should not be an Iota"); } template diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index 23574087ff..d471801a30 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -21,8 +21,7 @@ #include template -size_t std_lower_bound(const std::vector &haystack, - const Ordinal needle) { +size_t std_lower_bound(const std::vector &haystack, const Ordinal needle) { const auto it = std::lower_bound(haystack.begin(), haystack.end(), needle); return it - haystack.begin(); } @@ -33,9 +32,7 @@ struct ThreadLowerBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; using hv_size_type = typename HaystackView::size_type; - ThreadLowerBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + ThreadLowerBoundFunctor(const hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} KOKKOS_INLINE_FUNCTION @@ -43,8 +40,7 @@ struct ThreadLowerBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::lower_bound_thread(haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, - __LINE__, int(i), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); ++lerrCount; } } @@ -56,13 +52,11 @@ struct ThreadLowerBoundFunctor { }; template -void test_lower_bound_thread(const std::vector &_haystack, - const T &_needle) { +void test_lower_bound_thread(const std::vector &_haystack, const T &_needle) { using execution_space = typename Device::execution_space; using Policy = Kokkos::RangePolicy; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + using u_const_view_t = Kokkos::View>; using size_type = typename u_const_view_t::size_type; // get expected value @@ -76,9 +70,7 @@ void test_lower_bound_thread(const std::vector &_haystack, // test lower_bound search int errCount; // run a single thread - Kokkos::parallel_reduce(Policy(0, 1), - ThreadLowerBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(0, 1), ThreadLowerBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -89,18 +81,14 @@ struct TeamLowerBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; using hv_size_type = typename HaystackView::size_type; - TeamLowerBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + TeamLowerBoundFunctor(const hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} - KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, - int &lerrCount) const { - hv_size_type idx = - KokkosKernels::lower_bound_team(handle, haystack_, needle_); + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, int &lerrCount) const { + hv_size_type idx = KokkosKernels::lower_bound_team(handle, haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, - int(handle.team_rank()), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); ++lerrCount; } } @@ -116,8 +104,7 @@ void test_lower_bound_team(const std::vector &_haystack, const T _needle) { using Policy = Kokkos::TeamPolicy; using Member = typename Policy::member_type; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + using u_const_view_t = Kokkos::View>; using size_type = typename u_const_view_t::size_type; // get expected value @@ -130,13 +117,10 @@ void test_lower_bound_team(const std::vector &_haystack, const T _needle) { // test lower_bound search const int leagueSize = 1; - const int teamSize = - KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; int errCount; - Kokkos::parallel_reduce( - Policy(leagueSize, teamSize), - TeamLowerBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(leagueSize, teamSize), + TeamLowerBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -218,38 +202,31 @@ void test_lower_bound() { } } -#define EXECUTE_TEST(T, DEVICE) \ - TEST_F(TestCategory, common##_##lower_bound##_##T##_##DEVICE) { \ - test_lower_bound(); \ - } +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##lower_bound##_##T##_##DEVICE) { test_lower_bound(); } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, TestDevice) #endif diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp index 6638c6e398..4f59a8857b 100644 --- a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -56,8 +56,6 @@ void testPrintConfiguration() { check_print_configuration(out); } -TEST_F(TestCategory, common_print_configuration) { - testPrintConfiguration(); -} +TEST_F(TestCategory, common_print_configuration) { testPrintConfiguration(); } #endif // KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP diff --git a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index e93a9d0939..30623a8691 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -33,8 +33,7 @@ // Then prefix-sum into randomOffsets. // This simulates a CRS rowmap or other batched sorting scenario template -size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, - size_t n, size_t avg) { +size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, size_t n, size_t avg) { srand(54321); auto countsHost = Kokkos::create_mirror_view(randomCounts); size_t total = 0; @@ -47,8 +46,7 @@ size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, } Kokkos::deep_copy(randomCounts, countsHost); Kokkos::deep_copy(randomOffsets, randomCounts); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - n, randomOffsets); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(n, randomOffsets); return total; } @@ -87,8 +85,7 @@ double getRandom() { template <> Coordinates getRandom() { - return Coordinates(getRandom(), getRandom(), - getRandom()); + return Coordinates(getRandom(), getRandom(), getRandom()); } // Specialize for Kokkos::complex, with the real and imaginary parts different @@ -99,9 +96,7 @@ struct kvHash { template struct kvHash> { - Kokkos::complex operator()(const Key& k) { - return Kokkos::complex(3 * k + 4, k - 10.4); - } + Kokkos::complex operator()(const Key& k) { return Kokkos::complex(3 * k + 4, k - 10.4); } }; template @@ -133,14 +128,12 @@ struct TestSerialRadixFunctor { using Key = typename KeyView::value_type; using UnsignedKey = typename std::make_unsigned::type; - TestSerialRadixFunctor(KeyView& keys_, KeyView& keysAux_, OrdView& counts_, - OrdView& offsets_) + TestSerialRadixFunctor(KeyView& keys_, KeyView& keysAux_, OrdView& counts_, OrdView& offsets_) : keys(keys_), keysAux(keysAux_), counts(counts_), offsets(offsets_) {} KOKKOS_INLINE_FUNCTION void operator()(const int i) const { int off = offsets(i); - KokkosKernels::SerialRadixSort( - (UnsignedKey*)keys.data() + off, (UnsignedKey*)keysAux.data() + off, - counts(i)); + KokkosKernels::SerialRadixSort((UnsignedKey*)keys.data() + off, + (UnsignedKey*)keysAux.data() + off, counts(i)); } KeyView keys; KeyView keysAux; @@ -155,20 +148,14 @@ struct TestSerialRadix2Functor { using UnsignedKey = typename std::make_unsigned::type; using Value = typename ValView::value_type; - TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, - ValView& valuesAux_, OrdView& counts_, + TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) - : keys(keys_), - keysAux(keysAux_), - values(values_), - valuesAux(valuesAux_), - counts(counts_), - offsets(offsets_) {} + : keys(keys_), keysAux(keysAux_), values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) {} KOKKOS_INLINE_FUNCTION void operator()(const int i) const { int off = offsets(i); - KokkosKernels::SerialRadixSort2( - (UnsignedKey*)keys.data() + off, (UnsignedKey*)keysAux.data() + off, - values.data() + off, valuesAux.data() + off, counts(i)); + KokkosKernels::SerialRadixSort2((UnsignedKey*)keys.data() + off, + (UnsignedKey*)keysAux.data() + off, values.data() + off, + valuesAux.data() + off, counts(i)); } KeyView keys; KeyView keysAux; @@ -188,8 +175,7 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); KeyView keys("Radix sort testing data", n); fillRandom(keys); // Sort using std::sort on host to do correctness test @@ -198,22 +184,17 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { KeyView keysAux("Radix sort aux data", n); // Run the sorting on device in all sub-arrays in parallel typedef Kokkos::RangePolicy range_policy; - Kokkos::parallel_for( - range_policy(0, k), - TestSerialRadixFunctor(keys, keysAux, counts, offsets)); + Kokkos::parallel_for(range_policy(0, k), TestSerialRadixFunctor(keys, keysAux, counts, offsets)); exec_space().fence(); - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); for (size_t i = 0; i < k; i++) { Key* begin = gold.data() + offsetsHost(i); Key* end = begin + countsHost(i); std::sort(begin, end); } // Copy actual result to host and compare - auto keysHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); for (size_t i = 0; i < n; i++) { ASSERT_EQ(keysHost(i), gold(i)); } @@ -230,8 +211,7 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); KeyView keys("Radix test keys", n); ValView data("Radix test data", n); // The keys are randomized @@ -243,25 +223,20 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { // Run the sorting on device in all sub-arrays in parallel typedef Kokkos::RangePolicy range_policy; // Deliberately using a weird number for vector length - Kokkos::parallel_for(range_policy(0, k), - TestSerialRadix2Functor( - keys, keysAux, data, dataAux, counts, offsets)); + Kokkos::parallel_for(range_policy(0, k), TestSerialRadix2Functor( + keys, keysAux, data, dataAux, counts, offsets)); exec_space().fence(); // Sort using std::sort on host to do correctness test - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); for (size_t i = 0; i < k; i++) { Key* begin = gold.data() + offsetsHost(i); Key* end = begin + countsHost(i); std::sort(begin, end); } // Copy results to host - auto keysHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); - auto dataHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); // Make sure keys are sorted exactly (stability of sort doesn't matter) for (size_t i = 0; i < n; i++) { ASSERT_EQ(keysHost(i), gold(i)); @@ -283,8 +258,7 @@ struct TestTeamBitonicFunctor { template KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::TeamBitonicSort( - values.data() + offsets(i), counts(i), t); + KokkosKernels::TeamBitonicSort(values.data() + offsets(i), counts(i), t); } ValView values; @@ -297,15 +271,14 @@ struct TestTeamBitonic2Functor { typedef typename KeyView::value_type Key; typedef typename ValView::value_type Value; - TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, - OrdView& offsets_) + TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) : keys(keys_), values(values_), counts(counts_), offsets(offsets_) {} template KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::TeamBitonicSort2( - keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); + KokkosKernels::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), + counts(i), t); } KeyView keys; @@ -324,25 +297,21 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) { OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); ValView data("Bitonic sort testing data", n); fillRandom(data); Kokkos::View gold("Host sorted", n); Kokkos::deep_copy(gold, data); // Run the sorting on device in all sub-arrays in parallel - Kokkos::parallel_for( - Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonicFunctor(data, counts, offsets)); + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + TestTeamBitonicFunctor(data, counts, offsets)); // Copy result to host auto dataHost = Kokkos::create_mirror_view(data); Kokkos::deep_copy(dataHost, data); // Sort using std::sort on host to do correctness test exec_space().fence(); - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); for (size_t i = 0; i < k; i++) { Scalar* begin = gold.data() + offsetsHost(i); Scalar* end = begin + countsHost(i); @@ -364,8 +333,7 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); KeyView keys("Bitonic test keys", n); ValView data("Bitonic test data", n); // The keys are randomized @@ -375,13 +343,10 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { // Run the sorting on device in all sub-arrays in parallel, just using vector // loops Deliberately using a weird number for vector length Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonic2Functor( - keys, data, counts, offsets)); + TestTeamBitonic2Functor(keys, data, counts, offsets)); exec_space().fence(); - auto countsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); // Sort using std::sort on host to do correctness test for (size_t i = 0; i < k; i++) { Key* begin = gold.data() + offsetsHost(i); @@ -389,10 +354,8 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { std::sort(begin, end); } // Copy results to host - auto keysHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); - auto dataHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); // Make sure keys are sorted exactly (stability of sort doesn't matter) for (size_t i = 0; i < n; i++) { ASSERT_EQ(keysHost(i), gold(i)); @@ -423,8 +386,7 @@ void testBitonicSort(size_t n) { fillRandom(data); KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), - CheckSortedFunctor(data), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckSortedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); } @@ -443,10 +405,7 @@ struct CheckOrderedFunctor { template struct CompareDescending { - KOKKOS_INLINE_FUNCTION bool operator()(const Scalar lhs, - const Scalar rhs) const { - return lhs > rhs; - } + KOKKOS_INLINE_FUNCTION bool operator()(const Scalar lhs, const Scalar rhs) const { return lhs > rhs; } }; template @@ -462,15 +421,13 @@ void testBitonicSortDescending() { fillRandom(data); KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), - CheckOrderedFunctor(data), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); } struct LexCompare { - KOKKOS_INLINE_FUNCTION bool operator()(const Coordinates lhs, - const Coordinates rhs) const { + KOKKOS_INLINE_FUNCTION bool operator()(const Coordinates lhs, const Coordinates rhs) const { if (lhs.x < rhs.x) return true; else if (lhs.x > rhs.x) @@ -497,8 +454,7 @@ void testBitonicSortLexicographic() { fillRandom(data); KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), - CheckOrderedFunctor(data), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); } @@ -520,8 +476,7 @@ TEST_F(TestCategory, common_serial_radix2) { for (size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) { testSerialRadixSort2(numArrays, arrayMax); testSerialRadixSort2(numArrays, arrayMax); - testSerialRadixSort2>(numArrays, - arrayMax); + testSerialRadixSort2>(numArrays, arrayMax); } } @@ -542,8 +497,7 @@ TEST_F(TestCategory, common_team_bitonic2) { for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { testTeamBitonicSort2(numArrays, arrayMax); testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2>(numArrays, - arrayMax); + testTeamBitonicSort2>(numArrays, arrayMax); } } diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index aace02a738..abd4cf655a 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -21,8 +21,7 @@ #include template -size_t std_upper_bound(const std::vector &haystack, - const Ordinal needle) { +size_t std_upper_bound(const std::vector &haystack, const Ordinal needle) { const auto it = std::upper_bound(haystack.begin(), haystack.end(), needle); return it - haystack.begin(); } @@ -33,9 +32,7 @@ struct ThreadUpperBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; using hv_size_type = typename HaystackView::size_type; - ThreadUpperBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + ThreadUpperBoundFunctor(const hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} KOKKOS_INLINE_FUNCTION @@ -43,8 +40,7 @@ struct ThreadUpperBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, - __LINE__, int(i), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); ++lerrCount; } } @@ -56,13 +52,11 @@ struct ThreadUpperBoundFunctor { }; template -void test_upper_bound_thread(const std::vector &_haystack, - const T &_needle) { +void test_upper_bound_thread(const std::vector &_haystack, const T &_needle) { using execution_space = typename Device::execution_space; using Policy = Kokkos::RangePolicy; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + using u_const_view_t = Kokkos::View>; using hv_size_type = typename u_const_view_t::size_type; // get expected value @@ -76,9 +70,7 @@ void test_upper_bound_thread(const std::vector &_haystack, // test upper_bound search int errCount; // run a single thread - Kokkos::parallel_reduce(Policy(0, 1), - ThreadUpperBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(0, 1), ThreadUpperBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -89,18 +81,14 @@ struct TeamUpperBoundFunctor { using hv_value_type = typename HaystackView::non_const_value_type; using hv_size_type = typename HaystackView::size_type; - TeamUpperBoundFunctor(const hv_size_type &expected, - const HaystackView &haystack, - const hv_value_type &needle) + TeamUpperBoundFunctor(const hv_size_type &expected, const HaystackView &haystack, const hv_value_type &needle) : expected_(expected), haystack_(haystack), needle_(needle) {} - KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, - int &lerrCount) const { - hv_size_type idx = - KokkosKernels::upper_bound_team(handle, haystack_, needle_); + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, int &lerrCount) const { + hv_size_type idx = KokkosKernels::upper_bound_team(handle, haystack_, needle_); if (idx != expected_) { - Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, - int(handle.team_rank()), int(expected_), int(idx)); + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); ++lerrCount; } } @@ -116,8 +104,7 @@ void test_upper_bound_team(const std::vector &_haystack, const T _needle) { using Policy = Kokkos::TeamPolicy; using Member = typename Policy::member_type; using view_t = Kokkos::View; - using u_const_view_t = Kokkos::View>; + using u_const_view_t = Kokkos::View>; using hv_size_type = typename u_const_view_t::size_type; // get expected value @@ -130,13 +117,10 @@ void test_upper_bound_team(const std::vector &_haystack, const T _needle) { // test upper_bound search const int leagueSize = 1; - const int teamSize = - KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; int errCount; - Kokkos::parallel_reduce( - Policy(leagueSize, teamSize), - TeamUpperBoundFunctor(expected, haystack, _needle), - errCount); + Kokkos::parallel_reduce(Policy(leagueSize, teamSize), + TeamUpperBoundFunctor(expected, haystack, _needle), errCount); EXPECT_EQ(0, errCount); } @@ -209,38 +193,31 @@ void test_upper_bound() { } } -#define EXECUTE_TEST(T, DEVICE) \ - TEST_F(TestCategory, common##_##upper_bound##_##T##_##DEVICE) { \ - test_upper_bound(); \ - } +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##upper_bound##_##T##_##DEVICE) { test_upper_bound(); } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, TestDevice) #endif diff --git a/common/unit_test/Test_Common_Version.hpp b/common/unit_test/Test_Common_Version.hpp index cb5265cfef..e2a5faeee2 100644 --- a/common/unit_test/Test_Common_Version.hpp +++ b/common/unit_test/Test_Common_Version.hpp @@ -42,8 +42,7 @@ void test_version_info() { static_assert(false, "KOKKOSKERNELS_VERSION_PATCH macro is not defined!"); #endif - static_assert(KOKKOSKERNELS_VERSION == (KOKKOSKERNELS_VERSION_MAJOR * 10000 + - KOKKOSKERNELS_VERSION_MINOR * 100 + + static_assert(KOKKOSKERNELS_VERSION == (KOKKOSKERNELS_VERSION_MAJOR * 10000 + KOKKOSKERNELS_VERSION_MINOR * 100 + KOKKOSKERNELS_VERSION_PATCH)); } diff --git a/common/unit_test/Test_Common_float128.hpp b/common/unit_test/Test_Common_float128.hpp index 846a5ef879..063fd06d80 100644 --- a/common/unit_test/Test_Common_float128.hpp +++ b/common/unit_test/Test_Common_float128.hpp @@ -32,7 +32,7 @@ #include #include -//#include +// #include #include #include @@ -55,9 +55,8 @@ std::ostream& operator<<(std::ostream& out, const __float128& x) { const int numCharPrinted = quadmath_snprintf(buf, bufSize, "%.30Qe", x); if (static_cast(numCharPrinted) >= bufSize) { std::ostringstream os; - os << "Failed to print __float128 value: buffer has " << bufSize - << " characters, but quadmath_snprintf wanted " << numCharPrinted - << " characters!"; + os << "Failed to print __float128 value: buffer has " << bufSize << " characters, but quadmath_snprintf wanted " + << numCharPrinted << " characters!"; throw std::runtime_error(os.str()); } out << buf; @@ -79,8 +78,7 @@ void testfloat128() { << "y = " << y << endl << "z = " << z << endl << "(double) z = " << static_cast(z) << endl - << "z - (double) z = " - << (z - static_cast<__float128>(static_cast(z))) << endl; + << "z - (double) z = " << (z - static_cast<__float128>(static_cast(z))) << endl; // FIXME (mfh 04 Sep 2015) The results of printing could depend on // the locale. This works fine for the default locale on my system. @@ -89,8 +87,7 @@ void testfloat128() { os << x; if (os.str() != "1.000000000000000000000000000000e+00") { success = false; - cout << "'_float128 x = 1.0' does not print correctly! It prints as " - << os.str() << "." << endl; + cout << "'_float128 x = 1.0' does not print correctly! It prints as " << os.str() << "." << endl; } } { diff --git a/common/unit_test/Test_Common_set_bit_count.hpp b/common/unit_test/Test_Common_set_bit_count.hpp index 6e2c6e80b6..7b6c996390 100644 --- a/common/unit_test/Test_Common_set_bit_count.hpp +++ b/common/unit_test/Test_Common_set_bit_count.hpp @@ -37,21 +37,17 @@ template struct ppctest { view_type view; typename view_type::non_const_type out_view; - ppctest(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ppctest(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION - void operator()(const size_t row) const { - out_view(row) = pop_count(view(row)); - } + void operator()(const size_t row) const { out_view(row) = pop_count(view(row)); } }; template struct ppccheck { view_type view; typename view_type::non_const_type out_view; - ppccheck(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ppccheck(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t row) const { @@ -69,8 +65,7 @@ view_type get_array_bit_count(view_type view) { typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::Test::GetArrayBitCount", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::GetArrayBitCount", my_exec_space(0, view.extent(0)), ppctest(view, out_view)); Kokkos::fence(); return out_view; @@ -81,8 +76,7 @@ view_type check_array_bit_count(view_type view) { typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::Test::CheckArrayBitCount", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::CheckArrayBitCount", my_exec_space(0, view.extent(0)), ppccheck(view, out_view)); Kokkos::fence(); return out_view; @@ -92,8 +86,7 @@ template struct ffstest { view_type view; typename view_type::non_const_type out_view; - ffstest(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ffstest(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t row) const { @@ -108,8 +101,7 @@ template struct ffscheck { view_type view; typename view_type::non_const_type out_view; - ffscheck(view_type view_, typename view_type::non_const_type out_view_) - : view(view_), out_view(out_view_) {} + ffscheck(view_type view_, typename view_type::non_const_type out_view_) : view(view_), out_view(out_view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t row) const { @@ -130,8 +122,7 @@ view_type get_ffs(view_type view) { typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::Test::GetFFS", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::GetFFS", my_exec_space(0, view.extent(0)), ffstest(view, out_view)); Kokkos::fence(); return out_view; @@ -142,8 +133,7 @@ view_type check_ffs(view_type view) { typename view_type::non_const_type out_view("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Common::Test::CheckFFS", - my_exec_space(0, view.extent(0)), + Kokkos::parallel_for("KokkosKernels::Common::Test::CheckFFS", my_exec_space(0, view.extent(0)), ffscheck(view, out_view)); Kokkos::fence(); return out_view; @@ -159,8 +149,7 @@ void test_set_bit_count() { nonconstview count_bit_view("count_bit_view", array_size); - typename nonconstview::HostMirror hview = - Kokkos::create_mirror_view(count_bit_view); + typename nonconstview::HostMirror hview = Kokkos::create_mirror_view(count_bit_view); for (int i = 0; i < array_size; ++i) { hview(i) = lno_t(rand()) * lno_t(rand()); @@ -170,18 +159,13 @@ void test_set_bit_count() { // KokkosKernels::Impl::kk_print_1Dview(count_bit_view); - myview out1 = - Test::get_array_bit_count( - count_bit_view); - myview out2 = - Test::check_array_bit_count( - count_bit_view); + myview out1 = Test::get_array_bit_count(count_bit_view); + myview out2 = Test::check_array_bit_count(count_bit_view); // KokkosKernels::Impl::kk_print_1Dview(out1); // KokkosKernels::Impl::kk_print_1Dview(out2); - bool is_identical = KokkosKernels::Impl::kk_is_identical_view< - myview, myview, typename myview::value_type, - typename device::execution_space>(out1, out2, 0); + bool is_identical = KokkosKernels::Impl::kk_is_identical_view(out1, out2, 0); EXPECT_TRUE(is_identical); } @@ -193,8 +177,7 @@ void test_ffs() { nonconstview count_bit_view("count_bit_view", array_size); - typename nonconstview::HostMirror hview = - Kokkos::create_mirror_view(count_bit_view); + typename nonconstview::HostMirror hview = Kokkos::create_mirror_view(count_bit_view); for (int i = 0; i < array_size; ++i) { hview(i) = lno_t(rand()) * lno_t(rand()); @@ -204,16 +187,13 @@ void test_ffs() { // KokkosKernels::Impl::kk_print_1Dview(count_bit_view); - myview out1 = - Test::get_ffs(count_bit_view); - myview out2 = - Test::check_ffs(count_bit_view); + myview out1 = Test::get_ffs(count_bit_view); + myview out2 = Test::check_ffs(count_bit_view); // KokkosKernels::Impl::kk_print_1Dview(out1); // KokkosKernels::Impl::kk_print_1Dview(out2); - bool is_identical = KokkosKernels::Impl::kk_is_identical_view< - myview, myview, typename myview::value_type, - typename device::execution_space>(out1, out2, 0); + bool is_identical = KokkosKernels::Impl::kk_is_identical_view(out1, out2, 0); EXPECT_TRUE(is_identical); } diff --git a/example/batched_solve/examples_helper.hpp b/example/batched_solve/examples_helper.hpp index 3010f66ba8..2bbe93fdfb 100644 --- a/example/batched_solve/examples_helper.hpp +++ b/example/batched_solve/examples_helper.hpp @@ -62,12 +62,8 @@ /// template -void create_saddle_point_matrices(const MatrixViewType &A, - const VectorViewType &Y, - const int n_dim = 3) { - Kokkos::Random_XorShift64_Pool< - typename MatrixViewType::device_type::execution_space> - random(13718); +void create_saddle_point_matrices(const MatrixViewType &A, const VectorViewType &Y, const int n_dim = 3) { + Kokkos::Random_XorShift64_Pool random(13718); const int N = A.extent(0); const int n = A.extent(1); const int n_2 = n_dim + 1; @@ -76,12 +72,8 @@ void create_saddle_point_matrices(const MatrixViewType &A, MatrixViewType xs("xs", N, n_1, n_dim); VectorViewType ys("ys", N, n_1); - Kokkos::fill_random( - xs, random, - Kokkos::reduction_identity::prod()); - Kokkos::fill_random( - ys, random, - Kokkos::reduction_identity::prod()); + Kokkos::fill_random(xs, random, Kokkos::reduction_identity::prod()); + Kokkos::fill_random(ys, random, Kokkos::reduction_identity::prod()); auto xs_host = Kokkos::create_mirror_view(xs); auto ys_host = Kokkos::create_mirror_view(ys); @@ -94,8 +86,8 @@ void create_saddle_point_matrices(const MatrixViewType &A, for (int i = 0; i < n_1; ++i) { for (int j = 0; j < n_1; ++j) { for (int l = 0; l < N; ++l) { - auto xs_i = Kokkos::subview(xs_host, l, i, Kokkos::ALL); - auto xs_j = Kokkos::subview(xs_host, l, j, Kokkos::ALL); + auto xs_i = Kokkos::subview(xs_host, l, i, Kokkos::ALL); + auto xs_j = Kokkos::subview(xs_host, l, j, Kokkos::ALL); typename MatrixViewType::value_type d = 0; for (int k = 0; k < n_dim; ++k) d += Kokkos::pow(xs_i(k) - xs_j(k), 2); d = Kokkos::sqrt(d); @@ -125,21 +117,12 @@ void create_saddle_point_matrices(const MatrixViewType &A, } template -void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, - const int N, const IntView &r, - const IntView &c, - const VectorViewType &D, - const VectorViewType &X, +void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, const int N, const IntView &r, + const IntView &c, const VectorViewType &D, const VectorViewType &X, const VectorViewType &B) { - Kokkos::Random_XorShift64_Pool< - typename VectorViewType::device_type::execution_space> - random(13718); - Kokkos::fill_random( - X, random, - Kokkos::reduction_identity::prod()); - Kokkos::fill_random( - B, random, - Kokkos::reduction_identity::prod()); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, Kokkos::reduction_identity::prod()); + Kokkos::fill_random(B, random, Kokkos::reduction_identity::prod()); auto D_host = Kokkos::create_mirror_view(D); auto r_host = Kokkos::create_mirror_view(r); @@ -181,8 +164,7 @@ void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, } template -void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, - const VType &diag) { +void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, const VType &diag) { auto diag_values_host = Kokkos::create_mirror_view(diag); auto values_host = Kokkos::create_mirror_view(V); auto row_ptr_host = Kokkos::create_mirror_view(r); @@ -197,8 +179,7 @@ void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, int BlkSize = diag.extent(1); for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } for (int j = 0; j < N; ++j) { diff --git a/example/batched_solve/static_pivoting.cpp b/example/batched_solve/static_pivoting.cpp index e8a25778fc..f8eabdee22 100644 --- a/example/batched_solve/static_pivoting.cpp +++ b/example/batched_solve/static_pivoting.cpp @@ -49,9 +49,7 @@ struct Functor_TeamTestStaticPivoting { const XYViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_TeamTestStaticPivoting(const AViewType &A, const XYViewType &X, - const XYViewType &Y) - : _A(A), _X(X), _Y(Y) {} + Functor_TeamTestStaticPivoting(const AViewType &A, const XYViewType &X, const XYViewType &Y) : _A(A), _X(X), _Y(Y) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -61,22 +59,16 @@ struct Functor_TeamTestStaticPivoting { auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); member.team_barrier(); - KokkosBatched::TeamGesv::invoke(member, - A, X, - Y); + KokkosBatched::TeamGesv::invoke(member, A, X, Y); member.team_barrier(); } inline void run() { std::string name("KokkosBatched::Test::StaticPivoting"); - Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO(), Kokkos::AUTO()); - using MatrixViewType = - Kokkos::View; + using MatrixViewType = Kokkos::View; const int n = _A.extent(1); size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); @@ -95,8 +87,7 @@ struct Functor_SerialTestStaticPivoting { const XYViewType _Y; KOKKOS_INLINE_FUNCTION - Functor_SerialTestStaticPivoting(const AViewType &A, const AViewType &tmp, - const XYViewType &X, const XYViewType &Y) + Functor_SerialTestStaticPivoting(const AViewType &A, const AViewType &tmp, const XYViewType &X, const XYViewType &Y) : _A(A), _tmp(tmp), _X(X), _Y(Y) {} KOKKOS_INLINE_FUNCTION void operator()(const int &matrix_id) const { @@ -104,8 +95,7 @@ struct Functor_SerialTestStaticPivoting { auto tmp = Kokkos::subview(_tmp, matrix_id, Kokkos::ALL, Kokkos::ALL); auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); - KokkosBatched::SerialGesv::invoke( - A, X, Y, tmp); + KokkosBatched::SerialGesv::invoke(A, X, Y, tmp); } inline void run() { @@ -144,12 +134,9 @@ int main(int /*argc*/, char ** /*argv[]*/) { KokkosKernels::Impl::kk_write_3Dview_to_file(A, "A.txt"); KokkosKernels::Impl::kk_write_2Dview_to_file(Y, "Y.txt"); - Functor_SerialTestStaticPivoting(A, tmp, - X, Y) - .run(); + Functor_SerialTestStaticPivoting(A, tmp, X, Y).run(); KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_serial.txt"); - Functor_TeamTestStaticPivoting(A2, X, Y2) - .run(); + Functor_TeamTestStaticPivoting(A2, X, Y2).run(); KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_team.txt"); } Kokkos::finalize(); diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp index b543ddaad6..ab14b4b07a 100644 --- a/example/batched_solve/team_GMRES.cpp +++ b/example/batched_solve/team_GMRES.cpp @@ -40,8 +40,8 @@ typedef Kokkos::DefaultExecutionSpace exec_space; -template +template struct Functor_TestBatchedTeamVectorGMRES { const ValuesViewType _values; const ValuesViewType _diag; @@ -53,10 +53,9 @@ struct Functor_TestBatchedTeamVectorGMRES { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES( - const ValuesViewType &values, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int team_size, - const int vector_length, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &values, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int team_size, + const int vector_length, KrylovHandleType &handle) : _values(values), _r(r), _c(c), @@ -67,11 +66,9 @@ struct Functor_TestBatchedTeamVectorGMRES { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES( - const ValuesViewType &values, const ValuesViewType &diag, - const IntView &r, const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int team_size, const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &values, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int team_size, const int vector_length, KrylovHandleType &handle) : _values(values), _diag(diag), _r(r), @@ -86,61 +83,42 @@ struct Functor_TestBatchedTeamVectorGMRES { KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; - - auto d = Kokkos::subview( - _values, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - using ScratchPadIntViewType = - Kokkos::View; - using ScratchPadValuesViewType = Kokkos::View< - typename ValuesViewType::non_const_value_type **, - typename ValuesViewType::array_layout, - typename ValuesViewType::execution_space::scratch_memory_space>; - - using Operator = - KokkosBatched::CrsMatrix; - - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); - - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview(_values, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntViewType = Kokkos::View; + using ScratchPadValuesViewType = + Kokkos::View; + + using Operator = KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); + + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); if (UsePrec) { - ScratchPadValuesViewType diag( - member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + ScratchPadValuesViewType diag(member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); using PrecOperator = KokkosBatched::JacobiPrec; KokkosBatched::TeamVectorCopy::invoke( - member, - Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL), - diag); + member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -149,10 +127,8 @@ struct Functor_TestBatchedTeamVectorGMRES { Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -168,21 +144,17 @@ struct Functor_TestBatchedTeamVectorGMRES { using ViewType2D = Kokkos::View; - size_t bytes_1D = - ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), 1); + size_t bytes_1D = ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), 1); size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); - size_t bytes_2D_1 = ViewType2D::shmem_size( - _handle.get_number_of_systems_per_team(), _X.extent(1)); - size_t bytes_2D_2 = ViewType2D::shmem_size( - _handle.get_number_of_systems_per_team(), maximum_iteration + 1); + size_t bytes_2D_1 = ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), maximum_iteration + 1); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( - 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); exec_space().fence(); timer.reset(); @@ -221,8 +193,7 @@ int main(int /*argc*/, char ** /*argv*/) { printf("N = %d, Blk = %d, nnz = %d\n", N, Blk, nnz); - create_tridiagonal_batched_matrices(nnz, Blk, N, rowOffsets, colIndices, - values, x, y); + create_tridiagonal_batched_matrices(nnz, Blk, N, rowOffsets, colIndices, values, x, y); // Replace y by ones: Kokkos::deep_copy(y, 1.); @@ -242,9 +213,7 @@ int main(int /*argc*/, char ** /*argv*/) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KokkosBatched::KrylovHandle; + using KrylovHandleType = KokkosBatched::KrylovHandle; const int N_team = 2; const int n_iterations = 150; @@ -255,8 +224,7 @@ int main(int /*argc*/, char ** /*argv*/) { const int ortho_strategy = 0; KrylovHandleType handle(N, N_team, n_iterations, true); - handle.Arnoldi_view = - Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); + handle.Arnoldi_view = Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); handle.set_max_iteration(n_iterations); handle.set_tolerance(tol); @@ -265,37 +233,27 @@ int main(int /*argc*/, char ** /*argv*/) { handle.set_compute_last_residual(true); double time = - Functor_TestBatchedTeamVectorGMRES(values, diag, rowOffsets, - colIndices, x, y, team_size, - vector_length, handle) + Functor_TestBatchedTeamVectorGMRES( + values, diag, rowOffsets, colIndices, x, y, team_size, vector_length, handle) .run(); printf("times = %f secondes\n", time); for (int i = 0; i < N; ++i) { if (handle.is_converged_host(i)) { - std::cout - << "System " << i << " converged in " - << handle.get_iteration_host(i) - << " iterations, the initial absolute norm of the residual was " - << handle.get_norm_host(i, 0) << " and is now " - << handle.get_last_norm_host(i) << std::endl; + std::cout << "System " << i << " converged in " << handle.get_iteration_host(i) + << " iterations, the initial absolute norm of the residual was " << handle.get_norm_host(i, 0) + << " and is now " << handle.get_last_norm_host(i) << std::endl; } else { - std::cout - << "System " << i << " did not converge in " - << handle.get_max_iteration() - << " iterations, the initial absolute norm of the residual was " - << handle.get_norm_host(i, 0) << " and is now " - << handle.get_last_norm_host(i) << std::endl; + std::cout << "System " << i << " did not converge in " << handle.get_max_iteration() + << " iterations, the initial absolute norm of the residual was " << handle.get_norm_host(i, 0) + << " and is now " << handle.get_last_norm_host(i) << std::endl; } } if (handle.is_converged_host()) std::cout << "All the systems have converged." << std::endl; else - std::cout << "There is at least one system that did not converge." - << std::endl; + std::cout << "There is at least one system that did not converge." << std::endl; } Kokkos::finalize(); } diff --git a/example/gmres/ex_real_A.cpp b/example/gmres/ex_real_A.cpp index 14c4eaeb15..f18ccfd278 100644 --- a/example/gmres/ex_real_A.cpp +++ b/example/gmres/ex_real_A.cpp @@ -31,16 +31,14 @@ int main(int argc, char* argv[]) { using CRS = KokkosSparse::CrsMatrix; using ViewVectorType = Kokkos::View; - using KernelHandle = - KokkosKernels::Experimental::KokkosKernelsHandle; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; std::string filename("bcsstk09.mtx"); // example matrix std::string ortho("CGS2"); // orthog type int m = 50; // Max subspace size before restarting. - double convTol = 1e-10; // Relative residual convergence tolerance. - int cycLim = 50; // Maximum number of times to restart the solver. - bool rand_rhs = false; // Generate random right-hand side. + double convTol = 1e-10; // Relative residual convergence tolerance. + int cycLim = 50; // Maximum number of times to restart the solver. + bool rand_rhs = false; // Generate random right-hand side. for (int i = 1; i < argc; ++i) { const std::string& token = argv[i]; @@ -51,29 +49,26 @@ int main(int argc, char* argv[]) { if (token == std::string("--ortho")) ortho = argv[++i]; if (token == std::string("--rand_rhs")) rand_rhs = true; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos GMRES solver options:" << std::endl - << "--filename : The name of a matrix market (.mtx) file for " - "matrix A (Default bcsstk09.mtx)." - << std::endl - << "--max-subsp : The maximum size of the Kyrlov subspace before " - "restarting (Default 50)." - << std::endl - << "--max-restarts: Maximum number of GMRES restarts (Default 50)." - << std::endl - << "--tol : Convergence tolerance. (Default 1e-10)." - << std::endl - << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " - "(Default 'CGS2')" - << std::endl - << "--rand_rhs : Generate a random right-hand side b. (Else, " - "default uses b = vector of ones.)" - << std::endl - << "--help -h : Display this help message." << std::endl - << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " - "1e-5 --max-subsp 100 " - << std::endl - << std::endl; + std::cout << "Kokkos GMRES solver options:" << std::endl + << "--filename : The name of a matrix market (.mtx) file for " + "matrix A (Default bcsstk09.mtx)." + << std::endl + << "--max-subsp : The maximum size of the Kyrlov subspace before " + "restarting (Default 50)." + << std::endl + << "--max-restarts: Maximum number of GMRES restarts (Default 50)." << std::endl + << "--tol : Convergence tolerance. (Default 1e-10)." << std::endl + << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " + "(Default 'CGS2')" + << std::endl + << "--rand_rhs : Generate a random right-hand side b. (Else, " + "default uses b = vector of ones.)" + << std::endl + << "--help -h : Display this help message." << std::endl + << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " + "1e-5 --max-subsp 100 " + << std::endl + << std::endl; return 0; } } @@ -98,10 +93,8 @@ int main(int argc, char* argv[]) { auto gmres_handle = kh.get_gmres_handle(); // Get full gmres handle type using decltype. Deferencing a pointer gives a // reference, so we need to strip that too. - using GMRESHandle = - typename std::remove_reference::type; - gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 - : GMRESHandle::Ortho::MGS); + using GMRESHandle = typename std::remove_reference::type; + gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 : GMRESHandle::Ortho::MGS); if (rand_rhs) { // Make rhs random. @@ -128,8 +121,7 @@ int main(int argc, char* argv[]) { std::cout << "=========================================" << std::endl; std::cout << "Verify from main: Ending residual is " << endRes << std::endl; std::cout << "Number of iterations is: " << numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " - << endRelRes - endRes << std::endl; + std::cout << "Diff of residual from main - residual from solver: " << endRelRes - endRes << std::endl; std::cout << "Convergence flag is : " << convFlag << std::endl; } Kokkos::finalize(); diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index 8d1ff74b87..942dc176b6 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -27,14 +27,10 @@ int main(int argc, char* argv[]) { using OT = int; using EXSP = Kokkos::DefaultExecutionSpace; using MESP = typename EXSP::memory_space; - using CRS = - KokkosSparse::CrsMatrix, void, OT>; + using CRS = KokkosSparse::CrsMatrix, void, OT>; - using ViewVectorType = - Kokkos::View>; - using KernelHandle = - KokkosKernels::Experimental::KokkosKernelsHandle; + using ViewVectorType = Kokkos::View>; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; std::string ortho("CGS2"); // orthog type int n = 1000; // Matrix size @@ -53,29 +49,26 @@ int main(int argc, char* argv[]) { if (token == std::string("--ortho")) ortho = argv[++i]; if (token == std::string("--rand_rhs")) rand_rhs = true; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos GMRES solver options:" << std::endl - << "--mat-size : The size of the nxn test matrix. (Default: " - "n=1000.)" - << std::endl - << "--max-subsp : The maximum size of the Kyrlov subspace before " - "restarting (Default 50)." - << std::endl - << "--max-restarts: Maximum number of GMRES restarts (Default 50)." - << std::endl - << "--tol : Convergence tolerance. (Default 1e-10)." - << std::endl - << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " - "(Default 'CGS2')" - << std::endl - << "--rand_rhs : Generate a random right-hand side b. (Else, " - "default uses b = vector of ones.)" - << std::endl - << "--help -h : Display this help message." << std::endl - << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " - "1e-5 --max-subsp 100 " - << std::endl - << std::endl; + std::cout << "Kokkos GMRES solver options:" << std::endl + << "--mat-size : The size of the nxn test matrix. (Default: " + "n=1000.)" + << std::endl + << "--max-subsp : The maximum size of the Kyrlov subspace before " + "restarting (Default 50)." + << std::endl + << "--max-restarts: Maximum number of GMRES restarts (Default 50)." << std::endl + << "--tol : Convergence tolerance. (Default 1e-10)." << std::endl + << "--ortho : Type of orthogonalization. Use 'CGS2' or 'MGS'. " + "(Default 'CGS2')" + << std::endl + << "--rand_rhs : Generate a random right-hand side b. (Else, " + "default uses b = vector of ones.)" + << std::endl + << "--help -h : Display this help message." << std::endl + << "Example Call : ./Gmres.exe --filename Laplace3D100.mtx --tol " + "1e-5 --max-subsp 100 " + << std::endl + << std::endl; return 0; } } @@ -87,18 +80,16 @@ int main(int argc, char* argv[]) { auto gmres_handle = kh.get_gmres_handle(); // Get full gmres handle type using decltype. Deferencing a pointer gives a // reference, so we need to strip that too. - using GMRESHandle = - typename std::remove_reference::type; - gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 - : GMRESHandle::Ortho::MGS); + using GMRESHandle = typename std::remove_reference::type; + gmres_handle->set_ortho(ortho == "CGS2" ? GMRESHandle::Ortho::CGS2 : GMRESHandle::Ortho::MGS); // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse. - CRS A = KokkosSparse::Impl::kk_generate_diag_matrix(n); - auto myPrec = new KokkosSparse::Experimental::MatrixPrec( - KokkosSparse::Impl::kk_generate_diag_matrix(n, true)); + CRS A = KokkosSparse::Impl::kk_generate_diag_matrix(n); + auto myPrec = + new KokkosSparse::Experimental::MatrixPrec(KokkosSparse::Impl::kk_generate_diag_matrix(n, true)); ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"), n); // Solution and initial guess @@ -107,9 +98,8 @@ int main(int argc, char* argv[]) { n); // right-hand side vec int rand_seed = 123; Kokkos::Random_XorShift64_Pool<> pool(rand_seed); - Kokkos::fill_random( - X, pool, -1, - 1); // Use non-zero initial guess to test GMRES properties. + Kokkos::fill_random(X, pool, -1, + 1); // Use non-zero initial guess to test GMRES properties. if (rand_rhs) { Kokkos::fill_random(B, pool, -1, 1); } else { @@ -131,8 +121,7 @@ int main(int argc, char* argv[]) { std::cout << "=========================================" << std::endl; std::cout << "Verify from main: Ending residual is " << endRes << std::endl; std::cout << "Number of iterations is: " << numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " - << endRelRes - endRes << std::endl; + std::cout << "Diff of residual from main - residual from solver: " << endRelRes - endRes << std::endl; std::cout << "Convergence flag is : " << convFlag << std::endl; if (endRes < convTol && numIters == 1) { pass = true; diff --git a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp index 9a5537ee5b..5506ce68d8 100644 --- a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp +++ b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp @@ -100,8 +100,7 @@ struct Parameters { } }; -void print_options(std::ostream& os, const char* app_name, - unsigned int indent = 0) { +void print_options(std::ostream& os, const char* app_name, unsigned int indent = 0) { std::string spaces(indent, ' '); os << "Usage:" << std::endl << spaces << " " << app_name << " [parameters]" << std::endl @@ -110,14 +109,11 @@ void print_options(std::ostream& os, const char* app_name, << spaces << " Parallelism (select one of the following):" << std::endl << spaces << " --serial Execute serially." << std::endl << spaces << " --threads Use N posix threads." << std::endl - << spaces << " --openmp Use OpenMP with N threads." - << std::endl + << spaces << " --openmp Use OpenMP with N threads." << std::endl << spaces << " --cuda Use CUDA" << std::endl << std::endl << spaces << " Required Parameters:" << std::endl - << spaces - << " --amtx Input file in Matrix Market format (.mtx)." - << std::endl + << spaces << " --amtx Input file in Matrix Market format (.mtx)." << std::endl << std::endl << spaces << " --algorithm Set the algorithm to use. " @@ -173,16 +169,12 @@ void print_options(std::ostream& os, const char* app_name, << " --verbose-level Set verbosity level [0..5] " "where N > 0 means print verbose messags." << std::endl - << spaces << " Default: 0" - << std::endl - << spaces - << " --help Print out command line help." - << std::endl + << spaces << " Default: 0" << std::endl + << spaces << " --help Print out command line help." << std::endl << spaces << " " << std::endl; } -int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, - char** argv) { +int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, char** argv) { bool got_required_param_amtx = false; bool got_required_param_algorithm = false; @@ -208,40 +200,32 @@ int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, params.verbose_level = atoi(argv[++i]); params.verbose_level = std::min(5, params.verbose_level); params.verbose_level = std::max(0, params.verbose_level); - } else if (0 == - Test::string_compare_no_case(argv[i], "--output-histogram")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--output-histogram")) { params.output_histogram = 1; - } else if (0 == - Test::string_compare_no_case(argv[i], "--output-graphviz")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--output-graphviz")) { params.output_graphviz = 1; - } else if (0 == Test::string_compare_no_case( - argv[i], "--output-graphviz-vert-max")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--output-graphviz-vert-max")) { params.output_graphviz_vert_max = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--algorithm")) { ++i; - if (0 == - Test::string_compare_no_case(argv[i], "COLORING_D2_MATRIX_SQUARED")) { + if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_MATRIX_SQUARED")) { params.algorithm = 1; got_required_param_algorithm = true; - } else if (0 == - Test::string_compare_no_case(argv[i], "COLORING_D2_SERIAL")) { + } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_SERIAL")) { params.algorithm = 2; got_required_param_algorithm = true; } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_VB") || 0 == Test::string_compare_no_case(argv[i], "COLORING_D2")) { params.algorithm = 3; got_required_param_algorithm = true; - } else if (0 == - Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT")) { + } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT")) { params.algorithm = 4; got_required_param_algorithm = true; - } else if (0 == Test::string_compare_no_case(argv[i], - "COLORING_D2_VB_BIT_EF")) { + } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT_EF")) { params.algorithm = 5; got_required_param_algorithm = true; } else { - std::cerr << "2-Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "2-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; print_options(std::cout, argv[0]); return 1; } @@ -250,8 +234,7 @@ int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, print_options(std::cout, argv[0]); return 1; } else { - std::cerr << "3-Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; print_options(std::cout, argv[0]); return 1; } @@ -263,21 +246,19 @@ int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, return 1; } if (!got_required_param_algorithm) { - std::cout << "Missing required parameter algorithm" << std::endl - << std::endl; + std::cout << "Missing required parameter algorithm" << std::endl << std::endl; print_options(std::cout, argv[0]); return 1; } - if (!params.use_serial && !params.use_threads && !params.use_openmp && - !params.use_cuda) { + if (!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) { print_options(std::cout, argv[0]); return 1; } return 0; } -template +template void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { using namespace KokkosGraph; using namespace KokkosGraph::Experimental; @@ -285,14 +266,13 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { int algorithm = params.algorithm; int shmemsize = params.shmemsize; - using lno_view_type = typename CrsGraph_type::row_map_type::non_const_type; - using lno_nnz_view_type = - typename CrsGraph_type::entries_type::non_const_type; + using lno_view_type = typename CrsGraph_type::row_map_type::non_const_type; + using lno_nnz_view_type = typename CrsGraph_type::entries_type::non_const_type; using size_type = typename lno_view_type::non_const_value_type; using lno_type = typename lno_nnz_view_type::non_const_value_type; - using KernelHandle_type = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_type, kk_scalar_type, ExecSpace, TempMemSpace, - PersistentMemSpace>; + using KernelHandle_type = + KokkosKernels::Experimental::KokkosKernelsHandle; // Create a kernel handle KernelHandle_type kh; @@ -333,52 +313,39 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { break; } - std::cout << std::endl - << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; + std::cout << std::endl << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; // ------------------------------------------ // Call the distance-2 graph coloring routine // ------------------------------------------ - graph_compute_distance2_color(&kh, crsGraph.numRows(), num_cols, - crsGraph.row_map, crsGraph.entries, - crsGraph.row_map, crsGraph.entries); + graph_compute_distance2_color(&kh, crsGraph.numRows(), num_cols, crsGraph.row_map, crsGraph.entries, crsGraph.row_map, + crsGraph.entries); // ------------------------------------------ // Get the results // ------------------------------------------ - size_t num_colors = - kh.get_distance2_graph_coloring_handle()->get_num_colors(); - size_t num_phases = - kh.get_distance2_graph_coloring_handle()->get_num_phases(); + size_t num_colors = kh.get_distance2_graph_coloring_handle()->get_num_colors(); + size_t num_phases = kh.get_distance2_graph_coloring_handle()->get_num_phases(); if (params.verbose_level > 0) { - std::cout - << "Total Time: " - << kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() - << std::endl - << "Num colors: " - << kh.get_distance2_graph_coloring_handle()->get_num_colors() - << std::endl - << "Num Phases: " - << kh.get_distance2_graph_coloring_handle()->get_num_phases() - << std::endl - << "Colors:\n\t"; - KokkosKernels::Impl::print_1Dview( - kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); + std::cout << "Total Time: " << kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() << std::endl + << "Num colors: " << kh.get_distance2_graph_coloring_handle()->get_num_colors() << std::endl + << "Num Phases: " << kh.get_distance2_graph_coloring_handle()->get_num_phases() << std::endl + << "Colors:\n\t"; + KokkosKernels::Impl::print_1Dview(kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); std::cout << std::endl; } // ------------------------------------------ // Save coloring to a GraphViz file // ------------------------------------------ - if (params.output_graphviz && - crsGraph.numRows() <= params.output_graphviz_vert_max) { + if (params.output_graphviz && crsGraph.numRows() <= params.output_graphviz_vert_max) { auto colors = kh.get_distance2_graph_coloring_handle()->get_vertex_colors(); std::ofstream os("G.dot", std::ofstream::out); - kh.get_distance2_graph_coloring_handle()->dump_graphviz( - os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, colors); + kh.get_distance2_graph_coloring_handle()->dump_graphviz(os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, + colors); } // ------------------------------------------ @@ -394,29 +361,22 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { d2_coloring_is_valid = KokkosGraph::Impl::graph_verify_distance2_color( &kh, crsGraph.numRows(), // crsGraph.numCols(), - num_cols, crsGraph.row_map, crsGraph.entries, crsGraph.row_map, - crsGraph.entries, d2_coloring_validation_flags); + num_cols, crsGraph.row_map, crsGraph.entries, crsGraph.row_map, crsGraph.entries, d2_coloring_validation_flags); // Print out messages based on coloring validation check. if (d2_coloring_is_valid) { - std::cout << std::endl - << "Distance-2 Graph Coloring is VALID" << std::endl - << std::endl; + std::cout << std::endl << "Distance-2 Graph Coloring is VALID" << std::endl << std::endl; } else { str_color_is_valid = "INVALID"; std::cout << std::endl << "Distance-2 Graph Coloring is NOT VALID" << std::endl - << " - Vert(s) left uncolored : " - << d2_coloring_validation_flags[1] << std::endl - << " - Invalid D2 Coloring : " - << d2_coloring_validation_flags[2] << std::endl + << " - Vert(s) left uncolored : " << d2_coloring_validation_flags[1] << std::endl + << " - Invalid D2 Coloring : " << d2_coloring_validation_flags[2] << std::endl << std::endl; } if (d2_coloring_validation_flags[3]) { - std::cout << "Distance-2 Graph Coloring may have poor quality." - << std::endl - << " - Vert(s) have high color value : " - << d2_coloring_validation_flags[3] << std::endl + std::cout << "Distance-2 Graph Coloring may have poor quality." << std::endl + << " - Vert(s) have high color value : " << d2_coloring_validation_flags[3] << std::endl << std::endl; } } @@ -425,27 +385,24 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { // Print out a histogram of the colors // ------------------------------------------ if (0 != params.output_histogram) { - KokkosGraph::Impl::graph_print_distance2_color_histogram( - &kh, crsGraph.numRows(), num_cols, crsGraph.row_map, crsGraph.entries, - crsGraph.row_map, crsGraph.entries, false); + KokkosGraph::Impl::graph_print_distance2_color_histogram(&kh, crsGraph.numRows(), num_cols, crsGraph.row_map, + crsGraph.entries, crsGraph.row_map, crsGraph.entries, + false); } // ------------------------------------------ // Print out a summary // ------------------------------------------ std::string mtx_bin_file = params.mtx_bin_file; - mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); + mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); std::cout << "Summary" << std::endl << "-------" << std::endl - << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() - << std::endl + << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() << std::endl << " Filename : " << mtx_bin_file << std::endl << " Num Verts : " << crsGraph.numRows() << std::endl - << " Num Edges : " << crsGraph.entries.extent(0) - << std::endl - << " Concurrency : " - << Kokkos::DefaultExecutionSpace().concurrency() << std::endl + << " Num Edges : " << crsGraph.entries.extent(0) << std::endl + << " Concurrency : " << Kokkos::DefaultExecutionSpace().concurrency() << std::endl << " Algorithm : " << label_algorithm << std::endl << "Coloring Stats" << std::endl << " Num colors : " << num_colors << std::endl @@ -455,26 +412,21 @@ void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { } // run_example() -template +template void driver(Parameters params) { using myExecSpace = exec_space; using myFastDevice = Kokkos::Device; - using crstmat_type = - typename KokkosSparse::CrsMatrix; - using graph_type = typename crstmat_type::StaticCrsGraphType; - using data_type = typename graph_type::data_type; + using crstmat_type = typename KokkosSparse::CrsMatrix; + using graph_type = typename crstmat_type::StaticCrsGraphType; + using data_type = typename graph_type::data_type; char* mat_file = params.mtx_bin_file; - crstmat_type crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); + crstmat_type crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); graph_type crsgraph = crsmat.graph; data_type num_cols = crsmat.numCols(); - KokkosKernels::Example::run_example( + KokkosKernels::Example::run_example( crsgraph, num_cols, params); } // driver() @@ -494,13 +446,10 @@ int main(int argc, char* argv[]) { return 0; } - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads + const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of threads const int device_id = 0; - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); + Kokkos::initialize(Kokkos::InitializationSettings().set_num_threads(num_threads).set_device_id(device_id)); // Print out information about the configuration of the run if verbose_level // >= 5 @@ -510,22 +459,19 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_OPENMP) if (params.use_openmp) { - KokkosKernels::Example::driver(params); + KokkosKernels::Example::driver(params); } #endif #if defined(KOKKOS_ENABLE_CUDA) if (params.use_cuda) { - KokkosKernels::Example::driver(params); + KokkosKernels::Example::driver(params); } #endif #if defined(KOKKOS_ENABLE_SERIAL) if (params.use_serial) { - KokkosKernels::Example::driver(params); + KokkosKernels::Example::driver(params); } #endif diff --git a/example/graph/PartitioningExample.cpp b/example/graph/PartitioningExample.cpp index 1bef46cd28..7f06b216d3 100644 --- a/example/graph/PartitioningExample.cpp +++ b/example/graph/PartitioningExample.cpp @@ -28,7 +28,7 @@ using std::cout; using std::vector; -//#include "../../src/sparse/impl/KokkosSparse_partitioning_impl.hpp" +// #include "../../src/sparse/impl/KokkosSparse_partitioning_impl.hpp" int main(int argc, char* argv[]) { /* diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index 238fdef187..cf3b5767f7 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -40,18 +40,15 @@ void do_xpy(size_t n, bool time_only = false) { View y_rand("y_rand", n); View expected("expected", n); - View relative_error( - "relative_error", n); + View relative_error("relative_error", n); typename ViewType::HostMirror x_host = create_mirror_view(x); typename ViewType::HostMirror y_host = create_mirror_view(y); // TODO: Report segfault in random_pool creation with: // typename ViewType::HostMirror y_host = create_mirror_view(y_host); Random_XorShift64_Pool random_pool(12345); - fill_random(x_rand, random_pool, ReferenceScalarType(1.0), - ReferenceScalarType(2.0)); - fill_random(y_rand, random_pool, ReferenceScalarType(1.0), - ReferenceScalarType(2.0)); + fill_random(x_rand, random_pool, ReferenceScalarType(1.0), ReferenceScalarType(2.0)); + fill_random(y_rand, random_pool, ReferenceScalarType(1.0), ReferenceScalarType(2.0)); ExecutionSpace().fence(); deep_copy(x, x_rand); @@ -72,22 +69,18 @@ void do_xpy(size_t n, bool time_only = false) { if (!time_only) { for (size_t i = 0; i < n; i++) - expected(i) = static_cast(y_host(i)) + - static_cast(x_host(i)); + expected(i) = static_cast(y_host(i)) + static_cast(x_host(i)); } deep_copy(x_host, x); ExecutionSpace().fence(); - std::cout << "n: " << n << ", " << typeid(ScalarType).name() - << " Runtime(s): " << s << std::endl; + std::cout << "n: " << n << ", " << typeid(ScalarType).name() << " Runtime(s): " << s << std::endl; if (!time_only) { - std::cout << "n: " << n << ", " << typeid(ScalarType).name() - << " Relative Errors:" << std::endl; + std::cout << "n: " << n << ", " << typeid(ScalarType).name() << " Relative Errors:" << std::endl; for (size_t i = 0; i < n; i++) { - std::cout << ", " << std::abs(expected(i) - x_host(i)) / expected(i) - << std::endl; + std::cout << ", " << std::abs(expected(i) - x_host(i)) / expected(i) << std::endl; } std::cout << std::endl << std::endl; } diff --git a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp index 065c103cef..52de73fe29 100644 --- a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp +++ b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp @@ -52,8 +52,7 @@ typedef struct params { namespace KokkosKernels { namespace Experiment { -template +template struct functorTestHashmapAccumulator { typedef ExecutionSpace execution_space; typedef typename Kokkos::View data_view_t; @@ -65,17 +64,12 @@ struct functorTestHashmapAccumulator { const size_t _max_hash_entries; const parameters_t& _params; - typedef Kokkos::Experimental::UniqueToken< - execution_space, Kokkos::Experimental::UniqueTokenScope::Global> + typedef Kokkos::Experimental::UniqueToken unique_token_t; unique_token_t tokens; - functorTestHashmapAccumulator(const size_t num_entries, - const data_view_t& data, - uniform_memory_pool_t memory_pool, - const size_t hash_size, - const size_t max_hash_entries, - const parameters_t& params) + functorTestHashmapAccumulator(const size_t num_entries, const data_view_t& data, uniform_memory_pool_t memory_pool, + const size_t hash_size, const size_t max_hash_entries, const parameters_t& params) : _num_entries(num_entries), _data(data), _memory_pool(memory_pool), @@ -104,9 +98,7 @@ struct functorTestHashmapAccumulator { } scalar_t* ptr_memory_pool_chunk = (scalar_t*)(ptr_temp); - KokkosKernels::Experimental::HashmapAccumulator< - hash_size_type, hash_key_type, hash_value_type> - hash_map; + KokkosKernels::Experimental::HashmapAccumulator hash_map; // Set pointer to hash indices scalar_t* used_hash_indices = (scalar_t*)(ptr_temp); @@ -145,9 +137,8 @@ struct functorTestHashmapAccumulator { // Compute the hash index using & instead of % (modulus is slower). scalar_t hash = key & hash_func_pow2; - int r = hash_map.sequential_insert_into_hash_TrackHashes( - hash, key, &used_hash_size, hash_map.max_value_size, &used_hash_count, - used_hash_indices); + int r = hash_map.sequential_insert_into_hash_TrackHashes(hash, key, &used_hash_size, hash_map.max_value_size, + &used_hash_count, used_hash_indices); // Check return code if (r) { @@ -180,9 +171,7 @@ struct functorTestHashmapAccumulator { template void experiment(const parameters_t& params) { - typedef - typename KokkosKernels::Impl::UniformMemoryPool - uniform_memory_pool_t; + typedef typename KokkosKernels::Impl::UniformMemoryPool uniform_memory_pool_t; typedef typename Kokkos::View data_view_t; typedef typename data_view_t::HostMirror data_view_hostmirror_t; @@ -224,9 +213,8 @@ void experiment(const parameters_t& params) { // Set Hash Table Parameters size_t max_hash_entries = max_value; // Max number of entries that can be // inserted (values allowed are 1..100) - size_t hash_size_hint = - max_value; // How many hash keys are allowed. The actual hash size will - // be set to the next power of 2 bigger than hash_size_hint. + size_t hash_size_hint = max_value; // How many hash keys are allowed. The actual hash size will + // be set to the next power of 2 bigger than hash_size_hint. // Set the hash_size as the next power of 2 bigger than hash_size_hint. // - hash_size must be a power of two since we use & rather than % (which is @@ -237,8 +225,7 @@ void experiment(const parameters_t& params) { } // Create Uniform Initialized Memory Pool - KokkosKernels::Impl::PoolType pool_type = - KokkosKernels::Impl::OneThread2OneChunk; + KokkosKernels::Impl::PoolType pool_type = KokkosKernels::Impl::OneThread2OneChunk; // Determine memory chunk size for UniformMemoryPool size_t mem_chunk_size = hash_size; // for hash indices @@ -254,16 +241,12 @@ void experiment(const parameters_t& params) { // KokkosKernels::Impl::UniformMemoryPool m_space(mem_chunk_count, mem_chunk_size, -1, pool_type); - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, - pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, pool_type); - functorTestHashmapAccumulator - testHashmapAccumulator(num_entries, d_data, memory_pool, hash_size, - max_hash_entries, params); + functorTestHashmapAccumulator testHashmapAccumulator( + num_entries, d_data, memory_pool, hash_size, max_hash_entries, params); - Kokkos::parallel_for("testHashmapAccumulator", num_entries, - testHashmapAccumulator); + Kokkos::parallel_for("testHashmapAccumulator", num_entries, testHashmapAccumulator); if (params.verbose) { double t = timer.seconds(); @@ -275,8 +258,7 @@ void experiment(const parameters_t& params) { } // namespace Experiment } // namespace KokkosKernels -void print_options(std::ostream& os, const char* app_name, - unsigned int indent = 0) { +void print_options(std::ostream& os, const char* app_name, unsigned int indent = 0) { std::string spaces(indent, ' '); os << "Usage:" << std::endl << spaces << " " << app_name << " [parameters]" << std::endl @@ -285,15 +267,12 @@ void print_options(std::ostream& os, const char* app_name, << spaces << " Parallelism (select one of the following):" << std::endl << spaces << " --serial Execute serially." << std::endl << spaces << " --threads Use N posix threads." << std::endl - << spaces << " --openmp Use OpenMP with N threads." - << std::endl + << spaces << " --openmp Use OpenMP with N threads." << std::endl << spaces << " --cuda Use CUDA" << std::endl << spaces << " Optional Parameters:" << std::endl - << spaces << " --problem-size Problem Size (Default: 20)" - << std::endl + << spaces << " --problem-size Problem Size (Default: 20)" << std::endl << spaces << " --verbose Verbose output" << std::endl - << spaces << " --help Print out command line help." - << std::endl + << spaces << " --help Print out command line help." << std::endl << spaces << " " << std::endl; } // print_options @@ -321,19 +300,16 @@ int parse_inputs(parameters_t& params, int argc, char** argv) { } else if (0 == Test::string_compare_no_case(argv[i], "--verbose") || 0 == Test::string_compare_no_case(argv[i], "-V")) { params.verbose = true; - } else if (0 == Test::string_compare_no_case(argv[i], "help") || - 0 == Test::string_compare_no_case(argv[i], "-h")) { + } else if (0 == Test::string_compare_no_case(argv[i], "help") || 0 == Test::string_compare_no_case(argv[i], "-h")) { print_options(std::cout, argv[0]); return 1; } else { - std::cerr << "3-Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; print_options(std::cout, argv[0]); return 1; } } - if (!params.use_serial && !params.use_threads && !params.use_openmp && - !params.use_cuda) { + if (!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) { print_options(std::cout, argv[0]); return 1; } @@ -351,14 +327,11 @@ int main(int argc, char* argv[]) { return 1; } - const int device_id = 0; - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads + const int device_id = 0; + const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of threads - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); + Kokkos::initialize(Kokkos::InitializationSettings().set_num_threads(num_threads).set_device_id(device_id)); if (params.verbose) { Kokkos::print_configuration(std::cout); diff --git a/example/wiki/blas/abs/abs.cpp b/example/wiki/blas/abs/abs.cpp index c5a1d39e15..a74d4e3555 100644 --- a/example/wiki/blas/abs/abs.cpp +++ b/example/wiki/blas/abs/abs.cpp @@ -29,8 +29,7 @@ int main(int argc, char* argv[]) { double sum = 0.0; Kokkos::parallel_reduce( - "CheckValue", N, - KOKKOS_LAMBDA(const int& i, double& lsum) { lsum += y(i); }, sum); + "CheckValue", N, KOKKOS_LAMBDA(const int& i, double& lsum) { lsum += y(i); }, sum); printf("Sum: %lf Expected: %lf Diff: %e\n", sum, 1.0 * N, sum - 1.0 * N); diff --git a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp index 57f109f652..2137bf09e5 100644 --- a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp +++ b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -33,8 +33,8 @@ using DeviceSpace = typename ExecSpace::memory_space; using Kokkos::HostSpace; using RowmapType = Kokkos::View; using ColindsType = Kokkos::View; -using Handle = KokkosKernels::Experimental::KokkosKernelsHandle< - Offset, Ordinal, default_scalar, ExecSpace, DeviceSpace, DeviceSpace>; +using Handle = KokkosKernels::Experimental::KokkosKernelsHandle; namespace GraphDemo { Ordinal gridX = 15; @@ -124,10 +124,8 @@ void generate9pt(RowmapType& rowmapDevice, ColindsType& colindsDevice) { Offset numEdges = colinds.size(); // Now that the graph is formed, copy rowmap and colinds to Kokkos::Views in // device memory The nonowning host views just alias the std::vectors. - Kokkos::View> - rowmapHost(rowmap.data(), numVertices + 1); - Kokkos::View> - colindsHost(colinds.data(), numEdges); + Kokkos::View> rowmapHost(rowmap.data(), numVertices + 1); + Kokkos::View> colindsHost(colinds.data(), numEdges); // Allocate owning views on device with the correct size. rowmapDevice = RowmapType("Rowmap", numVertices + 1); colindsDevice = ColindsType("Colinds", numEdges); diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp index 027ee0a057..409564a334 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -29,9 +29,8 @@ int main() { { std::cout << "Coarsened vertex labels:\n"; Ordinal numClusters = 0; - auto labels = - KokkosGraph::graph_mis2_aggregate( - rowmapDevice, colindsDevice, numClusters); + auto labels = KokkosGraph::graph_mis2_aggregate(rowmapDevice, colindsDevice, + numClusters); // coarsening labels can be printed in the same way as colors GraphDemo::printColoring(labels, numClusters); putchar('\n'); diff --git a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp index ac62861e12..8ff0f6941d 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coloring.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coloring.cpp @@ -42,10 +42,9 @@ int main() { // Use the default algorithm (chosen based on ExecSpace) handle.create_graph_coloring_handle(KokkosGraph::COLORING_DEFAULT); // Run coloring (graph is square and symmetric) - KokkosGraph::Experimental::graph_color(&handle, numVertices, numVertices, - rowmapDevice, colindsDevice); + KokkosGraph::Experimental::graph_color(&handle, numVertices, numVertices, rowmapDevice, colindsDevice); // Get the colors array, and the number of colors used from the handle. - auto colors = handle.get_graph_coloring_handle()->get_vertex_colors(); + auto colors = handle.get_graph_coloring_handle()->get_vertex_colors(); Ordinal numColors = handle.get_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-1 Colors (used %d):\n", (int)numColors); GraphDemo::printColoring(colors, numColors); @@ -57,16 +56,12 @@ int main() { { Handle handle; // Use the default algorithm (chosen based on ExecSpace) - handle.create_distance2_graph_coloring_handle( - KokkosGraph::COLORING_D2_DEFAULT); + handle.create_distance2_graph_coloring_handle(KokkosGraph::COLORING_D2_DEFAULT); // Run coloring - KokkosGraph::Experimental::graph_color_distance2( - &handle, numVertices, rowmapDevice, colindsDevice); + KokkosGraph::Experimental::graph_color_distance2(&handle, numVertices, rowmapDevice, colindsDevice); // Get the colors array, and the number of colors used from the handle. - auto colors = - handle.get_distance2_graph_coloring_handle()->get_vertex_colors(); - Ordinal numColors = - handle.get_distance2_graph_coloring_handle()->get_num_colors(); + auto colors = handle.get_distance2_graph_coloring_handle()->get_vertex_colors(); + Ordinal numColors = handle.get_distance2_graph_coloring_handle()->get_num_colors(); printf("9-pt stencil: Distance-2 Colors (used %d):\n", (int)numColors); GraphDemo::printColoring(colors, numColors); putchar('\n'); diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp index 773930682f..2ee304d249 100644 --- a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -29,19 +29,16 @@ int main() { // algorithms { // Run coloring - auto misDevice = - KokkosGraph::graph_d2_mis( - rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); - std::cout << "Distance-2 MIS, FAST algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices - << " vertices.\n"; + auto misDevice = KokkosGraph::graph_d2_mis(rowmapDevice, colindsDevice, + KokkosGraph::MIS2_FAST); + std::cout << "Distance-2 MIS, FAST algorithm: contains " << misDevice.extent(0) << " out of " + << GraphDemo::numVertices << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); - misDevice = KokkosGraph::graph_d2_mis( - rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); - std::cout << "Distance-2 MIS, QUALITY algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices - << " vertices.\n"; + misDevice = KokkosGraph::graph_d2_mis(rowmapDevice, colindsDevice, + KokkosGraph::MIS2_QUALITY); + std::cout << "Distance-2 MIS, QUALITY algorithm: contains " << misDevice.extent(0) << " out of " + << GraphDemo::numVertices << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); } diff --git a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp index d23a7de233..29fdf61312 100644 --- a/example/wiki/graph/KokkosGraph_wiki_rcm.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_rcm.cpp @@ -17,19 +17,14 @@ #include "KokkosGraph_RCM.hpp" template -void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, - const labels_t& invPermIn) { +void printReorderedMatrix(const rowmap_t& rowmapIn, const entries_t& entriesIn, const labels_t& invPermIn) { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - auto rowmap = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn); - auto entries = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn); - auto invPerm = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), invPermIn); - lno_t numVerts = rowmap.extent(0) - 1; - decltype(invPerm) perm( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Perm"), numVerts); + auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn); + auto invPerm = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), invPermIn); + lno_t numVerts = rowmap.extent(0) - 1; + decltype(invPerm) perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Perm"), numVerts); for (lno_t i = 0; i < numVerts; i++) perm(invPerm(i)) = i; std::vector neighbors; for (lno_t i = 0; i < numVerts; i++) { @@ -68,9 +63,7 @@ int main() { // Step 2: Run RCM and print the reordered matrix { auto rcmDevice = - KokkosGraph::Experimental::graph_rcm(rowmapDevice, - colindsDevice); + KokkosGraph::Experimental::graph_rcm(rowmapDevice, colindsDevice); std::cout << "Graph reordered by reverse Cuthill-McKee:\n"; printReorderedMatrix(rowmapDevice, colindsDevice, rcmDevice); } diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp index eacf134f89..49721e595e 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp @@ -31,19 +31,14 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; - using b_matrix_type = - typename KokkosSparse::Experimental::BsrMatrix; - using graph_type = typename matrix_type::staticcrsgraph_type; - using row_map_type = typename graph_type::row_map_type; - using entries_type = typename graph_type::entries_type; - using values_type = typename matrix_type::values_type; + using device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; + using b_matrix_type = typename KokkosSparse::Experimental::BsrMatrix; + using graph_type = typename matrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + using values_type = typename matrix_type::values_type; const Scalar SC_ONE = Kokkos::ArithTraits::one(); @@ -70,8 +65,7 @@ int main() { { // Build the row pointers and store numNNZ - typename row_map_type::HostMirror row_map_h = - Kokkos::create_mirror_view(row_map); + typename row_map_type::HostMirror row_map_h = Kokkos::create_mirror_view(row_map); for (Ordinal rowIdx = 1; rowIdx < numRows + 1; ++rowIdx) { if ((rowIdx == 1) || (rowIdx == numRows)) { row_map_h(rowIdx) = row_map_h(rowIdx - 1) + 2; @@ -82,15 +76,13 @@ int main() { Kokkos::deep_copy(row_map, row_map_h); if (row_map_h(numRows) != numNNZ) { std::ostringstream error_msg; - error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" - << row_map_h(numRows) << ", numNNZ=" << numNNZ; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" << row_map_h(numRows) + << ", numNNZ=" << numNNZ; throw std::runtime_error(error_msg.str()); } - typename entries_type::HostMirror entries_h = - Kokkos::create_mirror_view(entries); - typename values_type::HostMirror values_h = - Kokkos::create_mirror_view(values); + typename entries_type::HostMirror entries_h = Kokkos::create_mirror_view(entries); + typename values_type::HostMirror values_h = Kokkos::create_mirror_view(values); for (Ordinal rowIdx = 0; rowIdx < numRows; ++rowIdx) { if (rowIdx == 0) { entries_h(row_map_h(rowIdx)) = rowIdx; diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp index 7ff56ff14a..527b0d56c4 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp @@ -43,8 +43,7 @@ struct bsr_fill { block_tmp(1, 0) = 0.0; block_tmp(1, 1) = 1.0; } else if (rowIdx == bsr_mat.numRows() - 1) { // Right boundary condition - auto block_tmp = - bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + auto block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); block_tmp(0, 0) = 1.0; block_tmp(1, 1) = 1.0; } else { @@ -54,13 +53,13 @@ struct bsr_fill { block_tmp(1, 0) = 0.0; block_tmp(1, 1) = -1.0; - block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); block_tmp(0, 0) = 2.0; block_tmp(0, 1) = 0.0; block_tmp(1, 0) = 0.0; block_tmp(1, 1) = 2.0; - block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 2); + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 2); block_tmp(0, 0) = -1.0; block_tmp(0, 1) = 1.0 / 2.0; block_tmp(1, 0) = 0.0; @@ -89,8 +88,7 @@ struct diagonal_extractor { KOKKOS_INLINE_FUNCTION void operator()(const int& rowIdx) const { - for (Offset entryIdx = row_map(rowIdx); entryIdx < row_map(rowIdx + 1); - ++entryIdx) { + for (Offset entryIdx = row_map(rowIdx); entryIdx < row_map(rowIdx + 1); ++entryIdx) { if (entries(entryIdx) == rowIdx) { bsr_block_type bsr_diag_block = bsr_mat.unmanaged_block(entryIdx); for (int i = 0; i < bsr_mat.blockDim(); ++i) { @@ -104,15 +102,12 @@ struct diagonal_extractor { }; int main(int argc, char* argv[]) { - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using bsrmatrix_type = - typename KokkosSparse::Experimental::BsrMatrix; - using graph_type = typename bsrmatrix_type::staticcrsgraph_type; - using row_map_type = typename graph_type::row_map_type; - using entries_type = typename graph_type::entries_type; + using device_type = + typename Kokkos::Device; + using bsrmatrix_type = typename KokkosSparse::Experimental::BsrMatrix; + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; Kokkos::initialize(argc, argv); { @@ -143,16 +138,12 @@ int main(int argc, char* argv[]) { bsrmatrix_type bsr_mat; { - typename row_map_type::non_const_type row_map( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "row pointers"), - numRows + 1); - typename entries_type::non_const_type entries( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "column indices"), - numNNZ); - typename row_map_type::HostMirror row_map_h = - Kokkos::create_mirror_view(row_map); - typename entries_type::HostMirror entries_h = - Kokkos::create_mirror_view(entries); + typename row_map_type::non_const_type row_map(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row pointers"), + numRows + 1); + typename entries_type::non_const_type entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "column indices"), + numNNZ); + typename row_map_type::HostMirror row_map_h = Kokkos::create_mirror_view(row_map); + typename entries_type::HostMirror entries_h = Kokkos::create_mirror_view(entries); // First Step: build the CrsGraph { @@ -181,8 +172,8 @@ int main(int argc, char* argv[]) { if (row_map_h(numRows) != numNNZ) { std::ostringstream error_msg; - error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" - << row_map_h(numRows) << ", numNNZ=" << numNNZ; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" << row_map_h(numRows) + << ", numNNZ=" << numNNZ; throw std::runtime_error(error_msg.str()); } Kokkos::deep_copy(row_map, row_map_h); @@ -204,16 +195,13 @@ int main(int argc, char* argv[]) { std::cout << " "; } std::cout << "*"; - for (Offset entryIdx = row_map_h(rowIdx); - entryIdx < row_map_h(rowIdx + 1) - 1; ++entryIdx) { - for (int colIdx = entries_h(entryIdx) + 1; - colIdx < entries_h(entryIdx + 1); ++colIdx) { + for (Offset entryIdx = row_map_h(rowIdx); entryIdx < row_map_h(rowIdx + 1) - 1; ++entryIdx) { + for (int colIdx = entries_h(entryIdx) + 1; colIdx < entries_h(entryIdx + 1); ++colIdx) { std::cout << " "; } std::cout << "*"; } - for (int colIdx = entries_h(row_map_h(rowIdx + 1) - 1) + 1; - colIdx < numRows; ++colIdx) { + for (int colIdx = entries_h(row_map_h(rowIdx + 1) - 1) + 1; colIdx < numRows; ++colIdx) { std::cout << " "; } std::cout << "]" << std::endl; @@ -221,24 +209,17 @@ int main(int argc, char* argv[]) { } // Extract diagonal block and store them in a rank-3 view - using diag_blocks_type = - Kokkos::View; - diag_blocks_type diag_blocks("diagonal blocks", numRows, blockSize, - blockSize); + using diag_blocks_type = Kokkos::View; + diag_blocks_type diag_blocks("diagonal blocks", numRows, blockSize, blockSize); diagonal_extractor myFunc(bsr_mat, diag_blocks); Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), myFunc); - auto diag_blocks_h = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, diag_blocks); + auto diag_blocks_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, diag_blocks); std::cout << "\nBsrMatrix diagonal blocks: " << std::endl; for (int blockId = 0; blockId < diag_blocks_h.extent_int(0); ++blockId) { - std::cout << " [" << diag_blocks_h(blockId, 0, 0) << ", " - << diag_blocks_h(blockId, 0, 1) << "]" << std::endl; - std::cout << " [" << diag_blocks_h(blockId, 1, 0) << ", " - << diag_blocks_h(blockId, 1, 1) << "]\n" - << std::endl; + std::cout << " [" << diag_blocks_h(blockId, 0, 0) << ", " << diag_blocks_h(blockId, 0, 1) << "]" << std::endl; + std::cout << " [" << diag_blocks_h(blockId, 1, 0) << ", " << diag_blocks_h(blockId, 1, 1) << "]\n" << std::endl; } } Kokkos::finalize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp b/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp index c8d6c805c1..21257d8034 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp @@ -29,12 +29,9 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; using graph_type = typename matrix_type::staticcrsgraph_type; using row_map_type = typename graph_type::row_map_type; using entries_type = typename graph_type::entries_type; @@ -52,8 +49,7 @@ int main() { { // Build the row pointers and store numNNZ - typename row_map_type::HostMirror row_map_h = - Kokkos::create_mirror_view(row_map); + typename row_map_type::HostMirror row_map_h = Kokkos::create_mirror_view(row_map); for (Ordinal rowIdx = 1; rowIdx < numRows + 1; ++rowIdx) { if ((rowIdx == 1) || (rowIdx == numRows)) { row_map_h(rowIdx) = row_map_h(rowIdx - 1) + 2; @@ -64,15 +60,13 @@ int main() { Kokkos::deep_copy(row_map, row_map_h); if (row_map_h(numRows) != numNNZ) { std::ostringstream error_msg; - error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" - << row_map_h(numRows) << ", numNNZ=" << numNNZ; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" << row_map_h(numRows) + << ", numNNZ=" << numNNZ; throw std::runtime_error(error_msg.str()); } - typename entries_type::HostMirror entries_h = - Kokkos::create_mirror_view(entries); - typename values_type::HostMirror values_h = - Kokkos::create_mirror_view(values); + typename entries_type::HostMirror entries_h = Kokkos::create_mirror_view(entries); + typename values_type::HostMirror values_h = Kokkos::create_mirror_view(values); for (Ordinal rowIdx = 0; rowIdx < numRows; ++rowIdx) { if (rowIdx == 0) { entries_h(row_map_h(rowIdx)) = rowIdx; diff --git a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp index 3dd8bfd5e5..31ccea3b0a 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp @@ -37,10 +37,10 @@ int main() { using ExecSpace = Kokkos::DefaultExecutionSpace; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - using Handle = KokkosKernels::Experimental::KokkosKernelsHandle< - Offset, Ordinal, default_scalar, ExecSpace, MemSpace, MemSpace>; - using Matrix = KokkosSparse::CrsMatrix; - using Vector = typename Matrix::values_type; + using Handle = + KokkosKernels::Experimental::KokkosKernelsHandle; + using Matrix = KokkosSparse::CrsMatrix; + using Vector = typename Matrix::values_type; constexpr Ordinal numRows = 10000; const Scalar one = Kokkos::ArithTraits::one(); const Mag magOne = Kokkos::ArithTraits::one(); @@ -52,32 +52,28 @@ int main() { // on which Gauss-Seidel should converge. Get approx. 20 entries per row // Diagonals are 2x the absolute sum of all other entries. Offset nnz = numRows * 20; - Matrix A = - KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< - Matrix>(numRows, numRows, nnz, 2, 100, 1.05 * one); - std::cout << "Generated a matrix with " << numRows << " rows/cols, and " - << nnz << " entries.\n"; + Matrix A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix(numRows, numRows, nnz, 2, 100, + 1.05 * one); + std::cout << "Generated a matrix with " << numRows << " rows/cols, and " << nnz << " entries.\n"; // Create a kernel handle, then a Gauss-Seidel handle with the default // algorithm Handle handle; handle.create_gs_handle(KokkosSparse::GS_DEFAULT); // Set up Gauss-Seidel for the graph (matrix sparsity pattern) - KokkosSparse::Experimental::gauss_seidel_symbolic( - &handle, numRows, numRows, A.graph.row_map, A.graph.entries, false); + KokkosSparse::Experimental::gauss_seidel_symbolic(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, + false); // Set up Gauss-Seidel for the matrix values (numeric) // Another matrix with the same sparsity pattern could re-use the handle and // symbolic phase, and only call numeric. - KokkosSparse::Experimental::gauss_seidel_numeric( - &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, - false); + KokkosSparse::Experimental::gauss_seidel_numeric(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, + A.values, false); // Now, preconditioner is ready to use. Set up an unknown vector // (uninitialized) and randomized right-hand-side vector. Vector x(Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), numRows); Vector b(Kokkos::view_alloc(Kokkos::WithoutInitializing, "b"), numRows); Vector res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "res"), numRows); auto bHost = Kokkos::create_mirror_view(b); - for (Ordinal i = 0; i < numRows; i++) - bHost(i) = 3 * ((one * rand()) / RAND_MAX); + for (Ordinal i = 0; i < numRows; i++) bHost(i) = 3 * ((one * rand()) / RAND_MAX); Kokkos::deep_copy(b, bHost); // Measure initial residual norm ||Ax - b||, where x is 0 Mag initialRes = KokkosBlas::nrm2(b); @@ -92,8 +88,7 @@ int main() { // * that b has changed since the previous apply (since there was no // previous apply) KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( - &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, - x, b, firstIter, firstIter, one, 1); + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, x, b, firstIter, firstIter, one, 1); firstIter = false; // Now, compute the new residual norm using SPMV Kokkos::deep_copy(res, b); @@ -102,8 +97,7 @@ int main() { // Recompute the scaled norm scaledResNorm = KokkosBlas::nrm2(res) / initialRes; numIters++; - std::cout << "Iteration " << numIters - << " scaled residual norm: " << scaledResNorm << '\n'; + std::cout << "Iteration " << numIters << " scaled residual norm: " << scaledResNorm << '\n'; } std::cout << "SUCCESS: converged in " << numIters << " iterations.\n"; } diff --git a/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp b/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp index 841e3b9eb3..c9edd7bc0c 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp @@ -28,14 +28,11 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; + using device_type = + typename Kokkos::Device; using execution_space = typename device_type::execution_space; using memory_space = typename device_type::memory_space; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using matrix_type = typename KokkosSparse::CrsMatrix; int return_value = 0; @@ -47,8 +44,7 @@ int main() { // In each row the first entry is the number of grid point in // that direction, the second and third entries are used to apply // BCs in that direction. - Kokkos::View mat_structure( - "Matrix Structure", 2); + Kokkos::View mat_structure("Matrix Structure", 2); mat_structure(0, 0) = 10; // Request 10 grid point in 'x' direction mat_structure(0, 1) = 1; // Add BC to the left mat_structure(0, 2) = 1; // Add BC to the right @@ -56,15 +52,13 @@ int main() { mat_structure(1, 1) = 1; // Add BC to the bottom mat_structure(1, 2) = 1; // Add BC to the top - matrix_type A = - Test::generate_structured_matrix2D("FD", mat_structure); - matrix_type B = - Test::generate_structured_matrix2D("FE", mat_structure); + matrix_type A = Test::generate_structured_matrix2D("FD", mat_structure); + matrix_type B = Test::generate_structured_matrix2D("FE", mat_structure); matrix_type C; // Create KokkosKernelHandle - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - Offset, Ordinal, Scalar, execution_space, memory_space, memory_space>; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; KernelHandle kh; kh.create_spadd_handle(false); diff --git a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp index 56a628ffd5..2b3ccd13d2 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp @@ -28,12 +28,9 @@ using Layout = default_layout; int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; int return_value = 0; @@ -45,8 +42,7 @@ int main() { // In each row the first entry is the number of grid point in // that direction, the second and third entries are used to apply // BCs in that direction. - Kokkos::View mat_structure( - "Matrix Structure", 2); + Kokkos::View mat_structure("Matrix Structure", 2); mat_structure(0, 0) = 10; // Request 10 grid point in 'x' direction mat_structure(0, 1) = 1; // Add BC to the left mat_structure(0, 2) = 1; // Add BC to the right @@ -54,15 +50,13 @@ int main() { mat_structure(1, 1) = 1; // Add BC to the bottom mat_structure(1, 2) = 1; // Add BC to the top - matrix_type A = - Test::generate_structured_matrix2D("FD", mat_structure); - matrix_type B = - Test::generate_structured_matrix2D("FE", mat_structure); + matrix_type A = Test::generate_structured_matrix2D("FD", mat_structure); + matrix_type B = Test::generate_structured_matrix2D("FE", mat_structure); matrix_type C = KokkosSparse::spgemm(A, false, B, false); - std::cout << "Ran spgemm: product C is " << C.numRows() << 'x' - << C.numCols() << " and has " << C.nnz() << " nonzeros.\n"; + std::cout << "Ran spgemm: product C is " << C.numRows() << 'x' << C.numCols() << " and has " << C.nnz() + << " nonzeros.\n"; } Kokkos::finalize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp b/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp index 8b876e5bfc..5778684a8a 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp @@ -44,12 +44,9 @@ struct check_spmv_functor { int main() { Kokkos::initialize(); - using device_type = typename Kokkos::Device< - Kokkos::DefaultExecutionSpace, - typename Kokkos::DefaultExecutionSpace::memory_space>; - using matrix_type = - typename KokkosSparse::CrsMatrix; + using device_type = + typename Kokkos::Device; + using matrix_type = typename KokkosSparse::CrsMatrix; using values_type = typename matrix_type::values_type; int return_value = 0; @@ -66,8 +63,7 @@ int main() { // BCs in that direction, BC=0 means Neumann BC is applied, // BC=1 means Dirichlet BC is applied by zeroing out the row and putting // one on the diagonal. - Kokkos::View mat_structure( - "Matrix Structure", 2); + Kokkos::View mat_structure("Matrix Structure", 2); mat_structure(0, 0) = 10; // Request 10 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -75,8 +71,7 @@ int main() { mat_structure(1, 1) = 0; // Add BC to the bottom mat_structure(1, 2) = 0; // Add BC to the top - matrix_type myMatrix = - Test::generate_structured_matrix2D("FD", mat_structure); + matrix_type myMatrix = Test::generate_structured_matrix2D("FD", mat_structure); const Ordinal numRows = myMatrix.numRows(); @@ -92,15 +87,12 @@ int main() { Ordinal count_errors = 0; check_spmv_functor check_spmv(y); - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, numRows), - check_spmv, count_errors); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, numRows), check_spmv, count_errors); if (count_errors > 0) { return_value = 1; - std::cout << "Found " << count_errors << " errors in y vector!" - << std::endl; + std::cout << "Found " << count_errors << " errors in y vector!" << std::endl; } else { - std::cout << "spmv was performed correctly: y = beta*y + alpha*A*x" - << std::endl; + std::cout << "spmv was performed correctly: y = beta*y + alpha*A*x" << std::endl; } } diff --git a/graph/impl/KokkosGraph_BFS_impl.hpp b/graph/impl/KokkosGraph_BFS_impl.hpp index 9ea5d63e07..34cb3c9179 100644 --- a/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/graph/impl/KokkosGraph_BFS_impl.hpp @@ -39,10 +39,8 @@ struct SerialRCM { SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) : numVerts(std::max(rowmap_.extent_int(0), 1) - 1), - rowmap(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostRowmap"), - rowmap_.extent(0)), - entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostEntries"), - entries_.extent(0)) { + rowmap(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostRowmap"), rowmap_.extent(0)), + entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostEntries"), entries_.extent(0)) { Kokkos::deep_copy(rowmap, rowmap_); Kokkos::deep_copy(entries, entries_); } @@ -51,11 +49,8 @@ struct SerialRCM { // Given a label L, labelReverse - L gives the reversed label (as in reverse // Cuthill McKee) lno_t labelReverse = numVerts - 1; - host_lno_view_t q(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Queue"), - numVerts); - host_lno_view_t label( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Permutation"), - numVerts); + host_lno_view_t q(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Queue"), numVerts); + host_lno_view_t label(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Permutation"), numVerts); for (lno_t i = 0; i < numVerts; i++) label(i) = -1; lno_t qhead = 0; lno_t qtail = 0; @@ -63,16 +58,12 @@ struct SerialRCM { // (heuristic for best to worst starting vertex for RCM). // If the graph has multiple connected components, restart at the first // unlabeled vertex in this list. - host_lno_view_t allVertices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "allVertices"), - numVerts); + host_lno_view_t allVertices(Kokkos::view_alloc(Kokkos::WithoutInitializing, "allVertices"), numVerts); for (lno_t i = 0; i < numVerts; i++) allVertices(i) = i; - std::sort(allVertices.data(), allVertices.data() + numVerts, - [&](lno_t n1, lno_t n2) -> bool { - // return true if n1 has a lower degree than n2 - return (rowmap(n1 + 1) - rowmap(n1)) < - (rowmap(n2 + 1) - rowmap(n2)); - }); + std::sort(allVertices.data(), allVertices.data() + numVerts, [&](lno_t n1, lno_t n2) -> bool { + // return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); + }); lno_t allVerticesIter = 0; // Start RCM with the first vertex in allVertices lno_t start = allVertices(allVerticesIter++); @@ -90,12 +81,10 @@ struct SerialRCM { neighbors.push_back(nei); } } - std::sort(neighbors.begin(), neighbors.end(), - [&](lno_t n1, lno_t n2) -> bool { - // return true if n1 has a lower degree than n2 - return (rowmap(n1 + 1) - rowmap(n1)) < - (rowmap(n2 + 1) - rowmap(n2)); - }); + std::sort(neighbors.begin(), neighbors.end(), [&](lno_t n1, lno_t n2) -> bool { + // return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < (rowmap(n2 + 1) - rowmap(n2)); + }); // label and enqueue all unlabeled neighbors for (lno_t nei : neighbors) { label(nei) = labelReverse - qtail; @@ -112,9 +101,7 @@ struct SerialRCM { q(qtail++) = restart; } } - lno_view_t labelOut( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCM Permutation"), - numVerts); + lno_view_t labelOut(Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCM Permutation"), numVerts); Kokkos::deep_copy(labelOut, label); return labelOut; } diff --git a/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 6bd1c022ae..2abc5c76e4 100644 --- a/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -36,8 +36,7 @@ namespace Impl { * General aim is to find the minimum number of colors, minimum number of * independent sets. */ -template +template class GraphColor { public: typedef lno_row_view_t_ in_lno_row_view_t; @@ -49,19 +48,15 @@ class GraphColor { typedef typename HandleType::size_type size_type; typedef typename HandleType::nnz_lno_t nnz_lno_t; - typedef typename in_lno_row_view_t::HostMirror - row_lno_host_view_t; // Host view type + typedef typename in_lno_row_view_t::HostMirror row_lno_host_view_t; // Host view type - typedef typename in_lno_nnz_view_t::HostMirror - nnz_lno_host_view_t; // Host view type + typedef typename in_lno_nnz_view_t::HostMirror nnz_lno_host_view_t; // Host view type - typedef typename HandleType::color_host_view_t - color_host_view_t; // Host view type + typedef typename HandleType::color_host_view_t color_host_view_t; // Host view type typedef typename HandleType::HandleExecSpace MyExecSpace; typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef - typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; typedef typename HandleType::const_size_type const_size_type; typedef typename lno_row_view_t_::const_type const_lno_row_view_t; @@ -70,8 +65,8 @@ class GraphColor { typedef typename lno_nnz_view_t_::non_const_type non_const_lno_nnz_view_t; protected: - nnz_lno_t nv; //# vertices - size_type ne; //# edges + nnz_lno_t nv; // # vertices + size_type ne; // # edges const_lno_row_view_t xadj; // rowmap const_lno_nnz_view_t adj; // entries const_lno_nnz_view_t kok_src, kok_dst; // Edge list storage of the graph @@ -87,25 +82,13 @@ class GraphColor { * \param coloring_handle: GraphColoringHandle object that holds the * specification about the graph coloring, including parameters. */ - GraphColor(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, - const_lno_nnz_view_t entries, HandleType *coloring_handle) - : nv(nv_), - ne(ne_), - xadj(row_map), - adj(entries), - kok_src(), - kok_dst(), - cp(coloring_handle) { - static_assert( - std::is_same< - size_type, - typename const_lno_row_view_t::non_const_value_type>::value, - "Row map element type does not match handle's size_type."); - static_assert( - std::is_same< - nnz_lno_t, - typename const_lno_nnz_view_t::non_const_value_type>::value, - "Entries element type does not match handle's nnz_lno_t."); + GraphColor(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, const_lno_nnz_view_t entries, + HandleType *coloring_handle) + : nv(nv_), ne(ne_), xadj(row_map), adj(entries), kok_src(), kok_dst(), cp(coloring_handle) { + static_assert(std::is_same::value, + "Row map element type does not match handle's size_type."); + static_assert(std::is_same::value, + "Entries element type does not match handle's nnz_lno_t."); } /** \brief GraphColor destructor. @@ -125,11 +108,9 @@ class GraphColor { virtual void color_graph(color_view_t d_colors, int &num_phases) { num_phases = 1; - color_host_view_t colors = Kokkos::create_mirror_view(d_colors); - typename const_lno_row_view_t::HostMirror h_xadj = - Kokkos::create_mirror_view(this->xadj); - typename const_lno_nnz_view_t::HostMirror h_adj = - Kokkos::create_mirror_view(this->adj); + color_host_view_t colors = Kokkos::create_mirror_view(d_colors); + typename const_lno_row_view_t::HostMirror h_xadj = Kokkos::create_mirror_view(this->xadj); + typename const_lno_nnz_view_t::HostMirror h_adj = Kokkos::create_mirror_view(this->adj); // typename nnz_lno_host_view_t::HostMirror::HostMirror::HostMirror h_adj = // tmp; @@ -185,10 +166,8 @@ class GraphColor { * based algorithms. VBCS: Speculative parallel vertex based using color set * implementation. */ -template -class GraphColor_VB - : public GraphColor { +template +class GraphColor_VB : public GraphColor { public: typedef long long int ban_type; @@ -202,32 +181,24 @@ class GraphColor_VB typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::color_t color_t; - typedef typename HandleType::color_host_view_t - color_host_view_t; // Host view type + typedef typename HandleType::color_host_view_t color_host_view_t; // Host view type typedef typename HandleType::HandleExecSpace MyExecSpace; typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef - typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - typedef typename Kokkos::View - single_dim_index_view_type; + typedef typename Kokkos::View single_dim_index_view_type; // typedef typename Kokkos::View // um_array_type; - typedef typename single_dim_index_view_type::HostMirror - single_dim_index_host_view_type; // Host view type + typedef typename single_dim_index_view_type::HostMirror single_dim_index_host_view_type; // Host view type typedef Kokkos::RangePolicy my_exec_space; - typedef typename HandleType::size_type_temp_work_view_t - size_type_temp_work_view_t; - typedef typename HandleType::size_type_persistent_work_view_t - size_type_persistent_work_view_t; + typedef typename HandleType::size_type_temp_work_view_t size_type_temp_work_view_t; + typedef typename HandleType::size_type_persistent_work_view_t size_type_persistent_work_view_t; - typedef - typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; @@ -240,21 +211,21 @@ class GraphColor_VB bool _serialConflictResolution; // if true use serial conflict resolution bool _ticToc; // if true print info in each step - ConflictList _conflict_scheme; // Enum: COLORING_NOCONFLICT, COLORING_ATOMIC, - // COLORING_PPS + ConflictList _conflict_scheme; // Enum: COLORING_NOCONFLICT, COLORING_ATOMIC, + // COLORING_PPS - double _pps_ratio; // the minimum number of reduction on the size of the - // conflictlist to create a new conflictlist + double _pps_ratio; // the minimum number of reduction on the size of the + // conflictlist to create a new conflictlist nnz_lno_t _min_vertex_cut_off; // minimum number of vertices to reduce the // conflictlist further. - bool _edge_filtering; // if true, edge-filtering is applied by swaps on - // adjacency array. - int _chunkSize; // the size of the minimum work unit assigned to threads. - // Changes the convergence on GPUs - char _use_color_set; // the VB algorithm type. - // 0 for VB: - // 1: for VBCS - // 2: for VBBIT + bool _edge_filtering; // if true, edge-filtering is applied by swaps on + // adjacency array. + int _chunkSize; // the size of the minimum work unit assigned to threads. + // Changes the convergence on GPUs + char _use_color_set; // the VB algorithm type. + // 0 for VB: + // 1: for VBCS + // 2: for VBBIT int _max_num_iterations; @@ -268,17 +239,14 @@ class GraphColor_VB * \param coloring_handle: GraphColoringHandle object that holds the * specification about the graph coloring, including parameters. */ - GraphColor_VB(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, - const_lno_nnz_view_t entries, HandleType *coloring_handle) - : GraphColor( - nv_, ne_, row_map, entries, coloring_handle), - _serialConflictResolution( - coloring_handle->get_serial_conflict_resolution()), + GraphColor_VB(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, const_lno_nnz_view_t entries, + HandleType *coloring_handle) + : GraphColor(nv_, ne_, row_map, entries, coloring_handle), + _serialConflictResolution(coloring_handle->get_serial_conflict_resolution()), _ticToc(coloring_handle->get_tictoc()), _conflict_scheme(coloring_handle->get_conflict_list_type()), _pps_ratio(coloring_handle->get_min_reduction_for_conflictlist()), - _min_vertex_cut_off( - coloring_handle->get_min_elements_for_conflictlist()), + _min_vertex_cut_off(coloring_handle->get_min_elements_for_conflictlist()), _edge_filtering(coloring_handle->get_vb_edge_filtering()), _chunkSize(coloring_handle->get_vb_chunk_size()), _use_color_set(), @@ -309,20 +277,15 @@ class GraphColor_VB virtual void color_graph(color_view_type colors, int &num_loops) { if (this->_ticToc) { std::cout << "\tVB params:" << std::endl - << "\tuseConflictList:" << int(this->_conflict_scheme) - << std::endl + << "\tuseConflictList:" << int(this->_conflict_scheme) << std::endl << "\talgorithm:" << (int)this->_use_color_set << std::endl - << "\tserialConflictResolution:" - << (int)this->_serialConflictResolution << std::endl + << "\tserialConflictResolution:" << (int)this->_serialConflictResolution << std::endl << "\tticToc:" << (int)this->_ticToc << std::endl << "\tuse_color_set:" << (int)this->_use_color_set << std::endl << "\tpps_ratio:" << this->_pps_ratio << std::endl - << "\tmin_vertex_cut_off:" << this->_min_vertex_cut_off - << std::endl - << "\tedge_filtering:" << (int)this->_edge_filtering - << std::endl - << "\tmax_num_iterations:" << this->_max_num_iterations - << std::endl + << "\tmin_vertex_cut_off:" << this->_min_vertex_cut_off << std::endl + << "\tedge_filtering:" << (int)this->_edge_filtering << std::endl + << "\tmax_num_iterations:" << this->_max_num_iterations << std::endl << "\tchunkSize:" << this->_chunkSize << std::endl; } @@ -334,9 +297,7 @@ class GraphColor_VB // We need to copy the adjacency array so that we dont harm the original // one. if (this->_edge_filtering) { - adj_copy = nnz_lno_temp_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), - this->ne); + adj_copy = nnz_lno_temp_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), this->ne); Kokkos::deep_copy(adj_copy, this->adj); } @@ -348,9 +309,8 @@ class GraphColor_VB } // the conflictlist - nnz_lno_temp_work_view_t current_vertexList = nnz_lno_temp_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), - this->nv); + nnz_lno_temp_work_view_t current_vertexList = + nnz_lno_temp_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), this->nv); nnz_lno_t current_vertexListLength = this->nv; if (this->cp->get_use_vtx_list()) { @@ -359,9 +319,8 @@ class GraphColor_VB current_vertexListLength = this->cp->get_vertex_list_size(); } else { // init vertexList sequentially. - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::InitList", my_exec_space(0, this->nv), - functorInitList(current_vertexList)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::InitList", my_exec_space(0, this->nv), + functorInitList(current_vertexList)); } // the next iteration's conflict list @@ -374,11 +333,9 @@ class GraphColor_VB // if a conflictlist is used if (this->_conflict_scheme != COLORING_NOCONFLICT) { // Vertices to recolor. Will swap with vertexList. - next_iteration_recolorList = nnz_lno_temp_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), - this->nv); - next_iteration_recolorListLength = - single_dim_index_view_type("recolorListLength"); + next_iteration_recolorList = + nnz_lno_temp_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), this->nv); + next_iteration_recolorListLength = single_dim_index_view_type("recolorListLength"); } nnz_lno_t numUncolored = this->nv; @@ -394,13 +351,13 @@ class GraphColor_VB if (this->_edge_filtering) { // First color greedy speculatively, // some conflicts expected - this->colorGreedyEF(this->xadj, adj_copy, colors, vertex_color_set, - current_vertexList, current_vertexListLength); + this->colorGreedyEF(this->xadj, adj_copy, colors, vertex_color_set, current_vertexList, + current_vertexListLength); } else { // First color greedy speculatively, // some conflicts expected - this->colorGreedy(this->xadj, this->adj, colors, vertex_color_set, - current_vertexList, current_vertexListLength); + this->colorGreedy(this->xadj, this->adj, colors, vertex_color_set, current_vertexList, + current_vertexListLength); } MyExecSpace().fence(); @@ -408,22 +365,19 @@ class GraphColor_VB if (this->_ticToc) { double t = timer.seconds(); total_time_greedy_phase += t; - std::cout << "\tTime speculative greedy phase " << iter << " : " << t - << std::endl; + std::cout << "\tTime speculative greedy phase " << iter << " : " << t << std::endl; timer.reset(); } bool swap_work_arrays = true; if (this->_edge_filtering) { - numUncolored = this->findConflicts( - swap_work_arrays, this->xadj, adj_copy, colors, vertex_color_set, - current_vertexList, current_vertexListLength, - next_iteration_recolorList, next_iteration_recolorListLength); + numUncolored = + this->findConflicts(swap_work_arrays, this->xadj, adj_copy, colors, vertex_color_set, current_vertexList, + current_vertexListLength, next_iteration_recolorList, next_iteration_recolorListLength); } else { - numUncolored = this->findConflicts( - swap_work_arrays, this->xadj, this->adj, colors, vertex_color_set, - current_vertexList, current_vertexListLength, - next_iteration_recolorList, next_iteration_recolorListLength); + numUncolored = + this->findConflicts(swap_work_arrays, this->xadj, this->adj, colors, vertex_color_set, current_vertexList, + current_vertexListLength, next_iteration_recolorList, next_iteration_recolorListLength); } MyExecSpace().fence(); @@ -431,41 +385,34 @@ class GraphColor_VB if (_ticToc) { double t = timer.seconds(); total_time_find_conflicts += t; - std::cout << "\tTime conflict detection " << iter << " : " << t - << std::endl; + std::cout << "\tTime conflict detection " << iter << " : " << t << std::endl; timer.reset(); } - if (this->_serialConflictResolution) - break; // Break after first iteration. - if (this->_conflict_scheme != COLORING_NOCONFLICT && swap_work_arrays && - (iter + 1 < this->_max_num_iterations)) { + if (this->_serialConflictResolution) break; // Break after first iteration. + if (this->_conflict_scheme != COLORING_NOCONFLICT && swap_work_arrays && (iter + 1 < this->_max_num_iterations)) { // Swap recolorList and vertexList - nnz_lno_temp_work_view_t temp = current_vertexList; - current_vertexList = next_iteration_recolorList; - next_iteration_recolorList = temp; - current_vertexListLength = numUncolored; - next_iteration_recolorListLength = - single_dim_index_view_type("recolorListLength"); + nnz_lno_temp_work_view_t temp = current_vertexList; + current_vertexList = next_iteration_recolorList; + next_iteration_recolorList = temp; + current_vertexListLength = numUncolored; + next_iteration_recolorListLength = single_dim_index_view_type("recolorListLength"); } } // if VBCS algorithm is used, the colors are converted back to original // form. if (this->_use_color_set == 1) { - Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", - my_exec_space(0, this->nv), + Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", my_exec_space(0, this->nv), set_final_colors(colors, vertex_color_set)); } if (numUncolored > 0) { if (this->_edge_filtering) { // Resolve conflicts by recoloring in serial - this->resolveConflicts(this->nv, this->xadj, adj_copy, colors, - current_vertexList, current_vertexListLength); + this->resolveConflicts(this->nv, this->xadj, adj_copy, colors, current_vertexList, current_vertexListLength); } else { // Resolve conflicts by recoloring in serial - this->resolveConflicts(this->nv, this->xadj, this->adj, colors, - current_vertexList, current_vertexListLength); + this->resolveConflicts(this->nv, this->xadj, this->adj, colors, current_vertexList, current_vertexListLength); } MyExecSpace().fence(); if (_ticToc) { @@ -478,8 +425,7 @@ class GraphColor_VB this->cp->add_to_overall_coloring_time_phase1(total_time_greedy_phase); this->cp->add_to_overall_coloring_time_phase2(total_time_find_conflicts); - this->cp->add_to_overall_coloring_time_phase3( - total_time_serial_conflict_resolution); + this->cp->add_to_overall_coloring_time_phase3(total_time_serial_conflict_resolution); } // color_graph (end) private: @@ -491,13 +437,10 @@ class GraphColor_VB * \param current_vertexList_: current conflictlist * \param current_vertexListLength_: size of current conflictlist */ - void colorGreedy(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, - color_view_type vertex_colors_, - nnz_lno_temp_work_view_t vertex_color_set, - nnz_lno_temp_work_view_t current_vertexList_, + void colorGreedy(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, color_view_type vertex_colors_, + nnz_lno_temp_work_view_t vertex_color_set, nnz_lno_temp_work_view_t current_vertexList_, nnz_lno_t current_vertexListLength_) { - nnz_lno_t chunkSize_ = - this->_chunkSize; // Process chunkSize vertices in one chunk + nnz_lno_t chunkSize_ = this->_chunkSize; // Process chunkSize vertices in one chunk if (current_vertexListLength_ < 100 * chunkSize_) chunkSize_ = 1; @@ -505,34 +448,28 @@ class GraphColor_VB if (this->_use_color_set == 2) { // std::cout << ">>> functorGreedyColor_IMPLOG" << std::endl; // // WCMCLEN - functorGreedyColor_IMPLOG gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, + functorGreedyColor_IMPLOG gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMPLOG", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMPLOG", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VBCS algorithm else if (this->_use_color_set == 1) { // std::cout << ">>> functorGreedyColor_IMP" << std::endl; // WCMCLEN - functorGreedyColor_IMP gc(this->nv, xadj_, adj_, vertex_colors_, - vertex_color_set, current_vertexList_, + functorGreedyColor_IMP gc(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMP", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMP", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VB algorithm else if (this->_use_color_set == 0) { // std::cout << ">>> functorGreedyColor" << std::endl; // WCMCLEN - functorGreedyColor gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, current_vertexListLength_, + functorGreedyColor gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } } // colorGreedy (end) @@ -544,13 +481,10 @@ class GraphColor_VB * \param current_vertexList_: current conflictlist * \param current_vertexListLength_: size of current conflictlist */ - void colorGreedyEF(const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, - color_view_type vertex_colors_, - nnz_lno_temp_work_view_t vertex_color_set, - nnz_lno_temp_work_view_t current_vertexList_, + void colorGreedyEF(const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, color_view_type vertex_colors_, + nnz_lno_temp_work_view_t vertex_color_set, nnz_lno_temp_work_view_t current_vertexList_, nnz_lno_t current_vertexListLength_) { - nnz_lno_t chunkSize_ = - this->_chunkSize; // Process chunkSize vertices in one chunk + nnz_lno_t chunkSize_ = this->_chunkSize; // Process chunkSize vertices in one chunk if (current_vertexListLength_ < 100 * chunkSize_) chunkSize_ = 1; @@ -559,34 +493,28 @@ class GraphColor_VB // If edge filtering is applied // std::cout << ">>> functorGreedyColor_IMPLOG_EF" << std::endl; // // WCMCLEN - functorGreedyColor_IMPLOG_EF gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, + functorGreedyColor_IMPLOG_EF gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMPLOG_EF", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMPLOG_EF", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VBCS algorithm else if (this->_use_color_set == 1) { // std::cout << ">>> functorGreedyColor_IMP_EF" << std::endl; // // WCMCLEN - functorGreedyColor_IMP_EF gc(this->nv, xadj_, adj_, vertex_colors_, - vertex_color_set, current_vertexList_, + functorGreedyColor_IMP_EF gc(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_IMP_EF", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_IMP_EF", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // VB algorithm else if (this->_use_color_set == 0) { // std::cout << ">>> functorGreedyColor_EF" << std::endl; // WCMCLEN - functorGreedyColor_EF gc(this->nv, xadj_, adj_, vertex_colors_, - current_vertexList_, current_vertexListLength_, + functorGreedyColor_EF gc(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, current_vertexListLength_, chunkSize_); - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::GreedyColor_EF", - my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::GraphColoring::GreedyColor_EF", + my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } } @@ -601,85 +529,63 @@ class GraphColor_VB * \param next_iteration_recolorListLength_: size of next conflictlist */ template - nnz_lno_t findConflicts( - bool &swap_work_arrays, const_lno_row_view_t xadj_, adj_view_t adj_, - color_view_type vertex_colors_, - nnz_lno_temp_work_view_t vertex_color_set_, - nnz_lno_temp_work_view_t current_vertexList_, - nnz_lno_t current_vertexListLength_, - nnz_lno_temp_work_view_t next_iteration_recolorList_, - single_dim_index_view_type next_iteration_recolorListLength_) { + nnz_lno_t findConflicts(bool &swap_work_arrays, const_lno_row_view_t xadj_, adj_view_t adj_, + color_view_type vertex_colors_, nnz_lno_temp_work_view_t vertex_color_set_, + nnz_lno_temp_work_view_t current_vertexList_, nnz_lno_t current_vertexListLength_, + nnz_lno_temp_work_view_t next_iteration_recolorList_, + single_dim_index_view_type next_iteration_recolorListLength_) { swap_work_arrays = true; nnz_lno_t numUncolored = 0; if (this->_conflict_scheme == COLORING_NOCONFLICT) { if (this->_use_color_set == 0 || this->_use_color_set == 2) { - functorFindConflicts_No_Conflist conf(this->nv, xadj_, adj_, - vertex_colors_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseA", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_No_Conflist conf(this->nv, xadj_, adj_, vertex_colors_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseA", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { - functorFindConflicts_No_Conflist_IMP conf( - this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseB", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_No_Conflist_IMP conf(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseB", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } } else if (this->_conflict_scheme == COLORING_PPS) { if (this->_use_color_set == 0 || this->_use_color_set == 2) { // Check for conflicts. Compute numUncolored == numConflicts. - functorFindConflicts_PPS conf( - this->nv, xadj_, adj_, vertex_colors_, current_vertexList_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseC", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_PPS conf(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseC", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { - functorFindConflicts_PPS_IMP conf( - this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, - current_vertexList_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflicts::CaseD", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_PPS_IMP conf(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, + current_vertexList_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseD", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } - if (numUncolored && - (current_vertexListLength_ >= this->_min_vertex_cut_off) && - (double(numUncolored) / current_vertexListLength_ < - (1.0 - this->_pps_ratio))) { + if (numUncolored && (current_vertexListLength_ >= this->_min_vertex_cut_off) && + (double(numUncolored) / current_vertexListLength_ < (1.0 - this->_pps_ratio))) { if (this->_ticToc) { - std::cout - << "\tcreating work array with pps current_vertexListLength_:" - << current_vertexListLength_ - << " params->min_vertex_cut_off:" << this->_min_vertex_cut_off - << std::endl; + std::cout << "\tcreating work array with pps current_vertexListLength_:" << current_vertexListLength_ + << " params->min_vertex_cut_off:" << this->_min_vertex_cut_off << std::endl; } single_dim_index_host_view_type h_numUncolored(&numUncolored); Kokkos::deep_copy(next_iteration_recolorListLength_, h_numUncolored); Kokkos::parallel_scan( - "KokkosGraph::GraphColoring::PrefixSum", - my_exec_space(0, current_vertexListLength_), - ppsWorklistFunctorVB( - this->nv, current_vertexList_, next_iteration_recolorList_)); + "KokkosGraph::GraphColoring::PrefixSum", my_exec_space(0, current_vertexListLength_), + ppsWorklistFunctorVB(this->nv, current_vertexList_, next_iteration_recolorList_)); } else { swap_work_arrays = false; } } else { // worklist scheme COLORING_ATOMIC if (this->_use_color_set == 0 || this->_use_color_set == 2) { // Check for conflicts. Compute numUncolored == numConflicts. - functorFindConflicts_Atomic conf( - this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, - next_iteration_recolorList_, next_iteration_recolorListLength_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflictsAtomic", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_Atomic conf(this->nv, xadj_, adj_, vertex_colors_, current_vertexList_, + next_iteration_recolorList_, next_iteration_recolorListLength_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflictsAtomic", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { - functorFindConflicts_Atomic_IMP conf( - this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, - current_vertexList_, next_iteration_recolorList_, - next_iteration_recolorListLength_); - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::FindConflictsAtomic_IMP", - my_exec_space(0, current_vertexListLength_), conf, numUncolored); + functorFindConflicts_Atomic_IMP conf(this->nv, xadj_, adj_, vertex_colors_, vertex_color_set_, + current_vertexList_, next_iteration_recolorList_, + next_iteration_recolorListLength_); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflictsAtomic_IMP", + my_exec_space(0, current_vertexListLength_), conf, numUncolored); } } if (this->_ticToc) { @@ -697,10 +603,8 @@ class GraphColor_VB * \param current_vertexListLength_: size of current conflictlist */ template - void resolveConflicts(nnz_lno_t _nv, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type vertex_colors_, - nnz_lno_temp_work_view_t current_vertexList_, - size_type current_vertexListLength_) { + void resolveConflicts(nnz_lno_t _nv, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type vertex_colors_, + nnz_lno_temp_work_view_t current_vertexList_, size_type current_vertexListLength_) { color_t *forbidden = new color_t[_nv]; nnz_lno_t i = 0; nnz_lno_t end = _nv; @@ -711,10 +615,9 @@ class GraphColor_VB h_recolor_list = Kokkos::create_mirror_view(current_vertexList_); Kokkos::deep_copy(h_recolor_list, current_vertexList_); } - color_host_view_t h_colors = Kokkos::create_mirror_view(vertex_colors_); - typename const_lno_row_view_t::HostMirror h_idx = - Kokkos::create_mirror_view(xadj_); - typename adj_view_t::HostMirror h_adj = Kokkos::create_mirror_view(adj_); + color_host_view_t h_colors = Kokkos::create_mirror_view(vertex_colors_); + typename const_lno_row_view_t::HostMirror h_idx = Kokkos::create_mirror_view(xadj_); + typename adj_view_t::HostMirror h_adj = Kokkos::create_mirror_view(adj_); Kokkos::deep_copy(h_colors, vertex_colors_); Kokkos::deep_copy(h_idx, xadj_); @@ -756,12 +659,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMPLOG_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, - nnz_lno_temp_work_view_t adj_, - color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, - nnz_lno_t chunkSize) + functorGreedyColor_IMPLOG_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t vertexList, + nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -794,8 +694,7 @@ class GraphColor_VB // we parse the neigborlist multiple times, // each time we look for a certain range of colors. - for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); - offset += VBBIT_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); offset += VBBIT_COLORING_FORBIDDEN_SIZE) { // Forbidden colors // we use a single (long) int for forbidden colors ban_type forbidden = 0; @@ -867,10 +766,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMPLOG(nnz_lno_t nv_, const_lno_row_view_t xadj_, - const_lno_nnz_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor_IMPLOG(nnz_lno_t nv_, const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, + nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -896,8 +794,7 @@ class GraphColor_VB color_t degree = my_xadj_end - xadjbegin; // My degree color_t offset = 0; - for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); - offset += VBBIT_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + VBBIT_COLORING_FORBIDDEN_SIZE); offset += VBBIT_COLORING_FORBIDDEN_SIZE) { ban_type forbidden = 0; // Forbidden colors // Check nbors, fill forbidden array. @@ -950,12 +847,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMP_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, - nnz_lno_temp_work_view_t adj_, - color_view_type colors, - nnz_lno_temp_work_view_t color_set, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor_IMP_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t color_set, + nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _xadj(xadj_), _adj(adj_), @@ -1033,10 +927,8 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, - const_lno_nnz_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t color_set, - nnz_lno_temp_work_view_t vertexList, + functorGreedyColor_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t color_set, nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _xadj(xadj_), @@ -1105,10 +997,9 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, - nnz_lno_temp_work_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor_EF(nnz_lno_t nv_, const_lno_row_view_t xadj_, nnz_lno_temp_work_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, + nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -1150,8 +1041,7 @@ class GraphColor_VB color_t offset = 0; size_type xadjbegin = _idx(i); - for (; (offset <= degree + VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); - offset += VB_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); offset += VB_COLORING_FORBIDDEN_SIZE) { // initialize for (int j = 0; j < VB_COLORING_FORBIDDEN_SIZE; j++) { forbidden[j] = false; @@ -1211,10 +1101,8 @@ class GraphColor_VB nnz_lno_t _vertexListLength; nnz_lno_t _chunkSize; - functorGreedyColor(nnz_lno_t nv_, const_lno_row_view_t xadj_, - const_lno_nnz_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_t vertexListLength, nnz_lno_t chunkSize) + functorGreedyColor(nnz_lno_t nv_, const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t vertexList, nnz_lno_t vertexListLength, nnz_lno_t chunkSize) : nv(nv_), _idx(xadj_), _adj(adj_), @@ -1253,8 +1141,7 @@ class GraphColor_VB // Do multiple passes if array is too small. color_t degree = _idx(i + 1) - _idx(i); // My degree color_t offset = 1; - for (; (offset <= degree + VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); - offset += VB_COLORING_FORBIDDEN_SIZE) { + for (; (offset <= degree + VB_COLORING_FORBIDDEN_SIZE) && (!foundColor); offset += VB_COLORING_FORBIDDEN_SIZE) { // initialize for (int j = 0; j < VB_COLORING_FORBIDDEN_SIZE; j++) { forbidden[j] = false; @@ -1271,8 +1158,7 @@ class GraphColor_VB // foundColor = true; // return; //} - if ((c >= offset) && (c - offset < VB_COLORING_FORBIDDEN_SIZE)) - forbidden[c - offset] = true; + if ((c >= offset) && (c - offset < VB_COLORING_FORBIDDEN_SIZE)) forbidden[c - offset] = true; } // color vertex i with smallest available color (FirstFit) @@ -1302,8 +1188,7 @@ class GraphColor_VB adj_view_t _adj; color_view_type _colors; - functorFindConflicts_No_Conflist(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors) + functorFindConflicts_No_Conflist(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors) : nv(nv_), _idx(xadj_), _adj(adj_), _colors(colors) {} KOKKOS_INLINE_FUNCTION @@ -1323,9 +1208,8 @@ class GraphColor_VB #endif _colors(neighbor) == my_color #ifdef DEGREECOMP - && - (myDegree < _idx(neighbor + 1) - _idx(neighbor) || - (myDegree == _idx(neighbor + 1) - _idx(neighbor) && ii < neighbor)) + && (myDegree < _idx(neighbor + 1) - _idx(neighbor) || + (myDegree == _idx(neighbor + 1) - _idx(neighbor) && ii < neighbor)) #endif ) { // std::cout << "me:" << ii << " n:" << neighbor << " color:" << @@ -1350,14 +1234,9 @@ class GraphColor_VB color_view_type _colors; nnz_lno_temp_work_view_t _vertexList; - functorFindConflicts_PPS(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors, + functorFindConflicts_PPS(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, nnz_lno_temp_work_view_t vertexList) - : nv(nv_), - _idx(xadj_), - _adj(adj_), - _colors(colors), - _vertexList(vertexList) {} + : nv(nv_), _idx(xadj_), _adj(adj_), _colors(colors), _vertexList(vertexList) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { @@ -1378,9 +1257,8 @@ class GraphColor_VB #endif _colors(neighbor) == my_color #ifdef DEGREECOMP - && - (myDegree < _idx(neighbor + 1) - _idx(neighbor) || - (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) + && (myDegree < _idx(neighbor + 1) - _idx(neighbor) || + (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i @@ -1405,10 +1283,8 @@ class GraphColor_VB nnz_lno_temp_work_view_t _recolorList; single_dim_index_view_type _recolorListLength; - functorFindConflicts_Atomic(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_temp_work_view_t recolorList, + functorFindConflicts_Atomic(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t vertexList, nnz_lno_temp_work_view_t recolorList, single_dim_index_view_type recolorListLength) : nv(nv_), _idx(xadj_), @@ -1420,9 +1296,7 @@ class GraphColor_VB KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { - typedef - typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; nnz_lno_t i = _vertexList(ii); color_t my_color = _colors(i); @@ -1441,15 +1315,13 @@ class GraphColor_VB #endif _colors(neighbor) == my_color #ifdef DEGREECOMP - && - (myDegree < _idx(neighbor + 1) - _idx(neighbor) || - (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) + && (myDegree < _idx(neighbor + 1) - _idx(neighbor) || + (myDegree == _idx(neighbor + 1) - _idx(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i // Atomically add vertex i to recolorList - const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), - atomic_incr_type(1)); + const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), atomic_incr_type(1)); _recolorList(k) = i; numConflicts += 1; break; // Once i is uncolored and marked conflict @@ -1470,16 +1342,9 @@ class GraphColor_VB color_view_type _colors; nnz_lno_temp_work_view_t _color_sets; - functorFindConflicts_No_Conflist_IMP(nnz_lno_t nv_, - const_lno_row_view_t xadj_, - adj_view_t adj_, - color_view_type colors, - nnz_lno_temp_work_view_t color_sets) - : nv(nv_), - _xadj(xadj_), - _adj(adj_), - _colors(colors), - _color_sets(color_sets) {} + functorFindConflicts_No_Conflist_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, + color_view_type colors, nnz_lno_temp_work_view_t color_sets) + : nv(nv_), _xadj(xadj_), _adj(adj_), _colors(colors), _color_sets(color_sets) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { @@ -1504,12 +1369,10 @@ class GraphColor_VB #ifndef DEGREECOMP ii < neighbor && neighbor < nv && #endif - _colors(neighbor) == my_color && - my_color_set == _color_sets(neighbor) + _colors(neighbor) == my_color && my_color_set == _color_sets(neighbor) #ifdef DEGREECOMP && (myDegree < _xadj(neighbor + 1) - _xadj(neighbor) || - (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && - ii < neighbor)) + (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && ii < neighbor)) #endif ) { _colors(ii) = 0; // Uncolor vertex i @@ -1535,16 +1398,9 @@ class GraphColor_VB nnz_lno_temp_work_view_t _color_sets; nnz_lno_temp_work_view_t _vertexList; - functorFindConflicts_PPS_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, - adj_view_t adj_, color_view_type colors, - nnz_lno_temp_work_view_t color_sets, - nnz_lno_temp_work_view_t vertexList) - : nv(nv_), - _xadj(xadj_), - _adj(adj_), - _colors(colors), - _color_sets(color_sets), - _vertexList(vertexList) {} + functorFindConflicts_PPS_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t color_sets, nnz_lno_temp_work_view_t vertexList) + : nv(nv_), _xadj(xadj_), _adj(adj_), _colors(colors), _color_sets(color_sets), _vertexList(vertexList) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { @@ -1570,12 +1426,10 @@ class GraphColor_VB #ifndef DEGREECOMP i < neighbor && neighbor < nv && #endif - _colors(neighbor) == my_color && - my_color_set == _color_sets(neighbor) + _colors(neighbor) == my_color && my_color_set == _color_sets(neighbor) #ifdef DEGREECOMP && (myDegree < _xadj(neighbor + 1) - _xadj(neighbor) || - (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && - i < neighbor)) + (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i @@ -1603,12 +1457,9 @@ class GraphColor_VB nnz_lno_temp_work_view_t _recolorList; single_dim_index_view_type _recolorListLength; - functorFindConflicts_Atomic_IMP( - nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, - color_view_type colors, nnz_lno_temp_work_view_t color_sets, - nnz_lno_temp_work_view_t vertexList, - nnz_lno_temp_work_view_t recolorList, - single_dim_index_view_type recolorListLength) + functorFindConflicts_Atomic_IMP(nnz_lno_t nv_, const_lno_row_view_t xadj_, adj_view_t adj_, color_view_type colors, + nnz_lno_temp_work_view_t color_sets, nnz_lno_temp_work_view_t vertexList, + nnz_lno_temp_work_view_t recolorList, single_dim_index_view_type recolorListLength) : nv(nv_), _xadj(xadj_), _adj(adj_), @@ -1620,16 +1471,13 @@ class GraphColor_VB KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t ii, nnz_lno_t &numConflicts) const { - typedef - typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; nnz_lno_t i = _vertexList(ii); color_t my_color = _colors(i); if (my_color == 0) { // this should only happen when one_color_set_per_iteration is set to // true. - const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), - atomic_incr_type(1)); + const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), atomic_incr_type(1)); _recolorList(k) = i; numConflicts++; } else { @@ -1647,19 +1495,16 @@ class GraphColor_VB #ifndef DEGREECOMP i < neighbor && neighbor < nv && #endif - _colors(neighbor) == my_color && - my_color_set == _color_sets(neighbor) + _colors(neighbor) == my_color && my_color_set == _color_sets(neighbor) #ifdef DEGREECOMP && (myDegree < _xadj(neighbor + 1) - _xadj(neighbor) || - (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && - i < neighbor)) + (myDegree == _xadj(neighbor + 1) - _xadj(neighbor) && i < neighbor)) #endif ) { _colors(i) = 0; // Uncolor vertex i _color_sets(i) = 0; // Atomically add vertex i to recolorList - const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), - atomic_incr_type(1)); + const nnz_lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), atomic_incr_type(1)); _recolorList(k) = i; numConflicts++; break; // Once i is uncolored and marked conflict @@ -1690,8 +1535,7 @@ class GraphColor_VB view_type _vertexList; view_type _recolorList; - ppsWorklistFunctorVB(nnz_lno_t nv_, const view_type &vertexList, - const view_type &recolorList) + ppsWorklistFunctorVB(nnz_lno_t nv_, const view_type &vertexList, const view_type &recolorList) : _nv(nv_), _vertexList(vertexList), _recolorList(recolorList) {} KOKKOS_INLINE_FUNCTION @@ -1709,9 +1553,8 @@ class GraphColor_VB */ struct set_final_colors { color_view_type kokcol; - nnz_lno_temp_work_view_t - kokcolset; // the colors that are represented with bits, and the colors - // set that the color is in. + nnz_lno_temp_work_view_t kokcolset; // the colors that are represented with bits, and the colors + // set that the color is in. color_t color_size; /** \brief functor constructor. @@ -1720,11 +1563,8 @@ class GraphColor_VB * color_set_ together is used to represent the colors e.g. color_set_(v) * * (numbits_in_idx-1) + set_bit_position_in_kokcolors_(v) */ - set_final_colors(color_view_type kokcol_, - nnz_lno_temp_work_view_t kokcolset_) - : kokcol(kokcol_), - kokcolset(kokcolset_), - color_size(sizeof(color_t) * 8) {} + set_final_colors(color_view_type kokcol_, nnz_lno_temp_work_view_t kokcolset_) + : kokcol(kokcol_), kokcolset(kokcolset_), color_size(sizeof(color_t) * 8) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t &ii) const { @@ -1747,10 +1587,8 @@ class GraphColor_VB /*! \brief Class for the deterministic vertex based graph coloring algorithms. */ -template -class GraphColor_VBD - : public GraphColor { +template +class GraphColor_VBD : public GraphColor { public: typedef long long int ban_type; @@ -1764,30 +1602,22 @@ class GraphColor_VBD typedef typename HandleType::nnz_lno_t nnz_lno_t; typedef typename HandleType::color_t color_t; - typedef typename HandleType::color_host_view_t - color_host_view_t; // Host view type + typedef typename HandleType::color_host_view_t color_host_view_t; // Host view type typedef typename HandleType::HandleExecSpace MyExecSpace; typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef - typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - typedef typename Kokkos::View - single_dim_index_view_type; - typedef typename single_dim_index_view_type::HostMirror - single_dim_index_host_view_type; // Host view type + typedef typename Kokkos::View single_dim_index_view_type; + typedef typename single_dim_index_view_type::HostMirror single_dim_index_host_view_type; // Host view type typedef Kokkos::RangePolicy my_exec_space; - typedef typename HandleType::size_type_temp_work_view_t - size_type_temp_work_view_t; - typedef typename HandleType::size_type_persistent_work_view_t - size_type_persistent_work_view_t; + typedef typename HandleType::size_type_temp_work_view_t size_type_temp_work_view_t; + typedef typename HandleType::size_type_persistent_work_view_t size_type_persistent_work_view_t; - typedef - typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; typedef typename in_lno_row_view_t::const_type const_lno_row_view_t; @@ -1795,9 +1625,9 @@ class GraphColor_VBD typedef typename lno_nnz_view_t_::non_const_type non_const_lno_nnz_view_t; protected: - bool _ticToc; // if true print info in each step - int _chunkSize; // the size of the minimum work unit assigned to threads. - // Changes the convergence on GPUs + bool _ticToc; // if true print info in each step + int _chunkSize; // the size of the minimum work unit assigned to threads. + // Changes the convergence on GPUs char _use_color_set; // the VBD algorithm type. // 0 for VBD: @@ -1811,10 +1641,9 @@ class GraphColor_VBD * \param coloring_handle: GraphColoringHandle object that holds the * specification about the graph coloring, including parameters. */ - GraphColor_VBD(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, - const_lno_nnz_view_t entries, HandleType *coloring_handle) - : GraphColor( - nv_, ne_, row_map, entries, coloring_handle), + GraphColor_VBD(nnz_lno_t nv_, size_type ne_, const_lno_row_view_t row_map, const_lno_nnz_view_t entries, + HandleType *coloring_handle) + : GraphColor(nv_, ne_, row_map, entries, coloring_handle), _ticToc(coloring_handle->get_tictoc()), _chunkSize(coloring_handle->get_vb_chunk_size()), _use_color_set() { @@ -1850,15 +1679,13 @@ class GraphColor_VBD nnz_lno_t numVertices = this->nv; - size_type maxColors = 0; - nnz_lno_persistent_work_view_t score = nnz_lno_persistent_work_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "score"), this->nv); - functorScoreCalculation scoreCalculation( - score, this->xadj); + size_type maxColors = 0; + nnz_lno_persistent_work_view_t score = + nnz_lno_persistent_work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "score"), this->nv); + functorScoreCalculation scoreCalculation(score, this->xadj); - Kokkos::parallel_reduce("Deterministic Coloring: compute initial scores", - my_exec_space(0, this->nv), scoreCalculation, - Kokkos::Max(maxColors)); + Kokkos::parallel_reduce("Deterministic Coloring: compute initial scores", my_exec_space(0, this->nv), + scoreCalculation, Kokkos::Max(maxColors)); if (this->_ticToc) { std::cout << "maxColors: " << maxColors << std::endl; @@ -1867,18 +1694,17 @@ class GraphColor_VBD // Create the dependency list of the graph nnz_lno_persistent_work_view_t dependency("dependency", numVertices); Kokkos::View frontierSize("frontierSize"); - typename Kokkos::View::HostMirror - host_frontierSize = Kokkos::create_mirror_view(frontierSize); - Kokkos::View newFrontierSize( - "newFrontierSize"); - typename Kokkos::View::HostMirror - host_newFrontierSize = Kokkos::create_mirror_view(newFrontierSize); + typename Kokkos::View::HostMirror host_frontierSize = + Kokkos::create_mirror_view(frontierSize); + Kokkos::View newFrontierSize("newFrontierSize"); + typename Kokkos::View::HostMirror host_newFrontierSize = + Kokkos::create_mirror_view(newFrontierSize); nnz_lno_temp_work_view_t frontier("frontier", numVertices); nnz_lno_temp_work_view_t newFrontier("newFrontier", numVertices); - functorInitialDependency myInitialDependency( - this->xadj, this->adj, score, dependency, newFrontier, newFrontierSize); - Kokkos::parallel_for("Deterministic Coloring: compute dependency list", - my_exec_space(0, numVertices), myInitialDependency); + functorInitialDependency myInitialDependency(this->xadj, this->adj, score, dependency, newFrontier, + newFrontierSize); + Kokkos::parallel_for("Deterministic Coloring: compute dependency list", my_exec_space(0, numVertices), + myInitialDependency); Kokkos::deep_copy(host_newFrontierSize, newFrontierSize); while (host_newFrontierSize() > 0) { @@ -1886,8 +1712,7 @@ class GraphColor_VBD // First swap fontier with newFrontier and fontierSize with // newFrontierSize reset newFrontierSize functorSwapOnDevice mySwapOnDevice(frontierSize, newFrontierSize); - Kokkos::parallel_for("Swap frontier sizes", my_exec_space(0, 1), - mySwapOnDevice); + Kokkos::parallel_for("Swap frontier sizes", my_exec_space(0, 1), mySwapOnDevice); Kokkos::deep_copy(host_frontierSize, frontierSize); { auto swap_tmp = frontier; @@ -1898,11 +1723,9 @@ class GraphColor_VBD // Loop over nodes in the frontier // First variant without bit array, easier to understand/program if (this->_use_color_set == 0) { - functorDeterministicColoring myDeterministicColoring( - this->xadj, this->adj, dependency, frontier, frontierSize, - newFrontier, newFrontierSize, maxColors, colors); - Kokkos::parallel_for("Deterministic Coloring: color nodes in frontier", - my_exec_space(0, host_frontierSize()), + functorDeterministicColoring myDeterministicColoring(this->xadj, this->adj, dependency, frontier, frontierSize, + newFrontier, newFrontierSize, maxColors, colors); + Kokkos::parallel_for("Deterministic Coloring: color nodes in frontier", my_exec_space(0, host_frontierSize()), myDeterministicColoring); } else if (this->_use_color_set == 1) { @@ -1911,12 +1734,9 @@ class GraphColor_VBD // we need to use successive color ranges of width 64 // to represent all the possible colors on the graph. functorDeterministicColoringBitArray myDeterministicColoringBitArray( - this->xadj, this->adj, dependency, frontier, frontierSize, - newFrontier, newFrontierSize, maxColors, colors); - Kokkos::parallel_for( - "Deterministic Coloring: color nodes in frontier", - my_exec_space(0, host_frontierSize()), - myDeterministicColoringBitArray); // Loop over current frontier + this->xadj, this->adj, dependency, frontier, frontierSize, newFrontier, newFrontierSize, maxColors, colors); + Kokkos::parallel_for("Deterministic Coloring: color nodes in frontier", my_exec_space(0, host_frontierSize()), + myDeterministicColoringBitArray); // Loop over current frontier } Kokkos::deep_copy(host_newFrontierSize, newFrontierSize); } // while newFrontierSize @@ -1928,14 +1748,13 @@ class GraphColor_VBD nnz_lno_persistent_work_view_t score_; const_lno_row_view_t numNeighbors_; - functorScoreCalculation(nnz_lno_persistent_work_view_t &score, - const_lno_row_view_t &numNeighbors) + functorScoreCalculation(nnz_lno_persistent_work_view_t &score, const_lno_row_view_t &numNeighbors) : score_(score), numNeighbors_(numNeighbors) {} KOKKOS_INLINE_FUNCTION void operator()(const int i, size_type &update) const { score_(i) = numNeighbors_(i + 1) - numNeighbors_(i); - update = ((size_type)score_(i) < update ? update : (size_type)score_(i)); + update = ((size_type)score_(i) < update ? update : (size_type)score_(i)); } }; // functorScoreCalculation() @@ -1943,9 +1762,8 @@ class GraphColor_VBD Kokkos::View frontierSize_; Kokkos::View newFrontierSize_; - functorSwapOnDevice( - Kokkos::View frontierSize, - Kokkos::View newFrontierSize) + functorSwapOnDevice(Kokkos::View frontierSize, + Kokkos::View newFrontierSize) : frontierSize_(frontierSize), newFrontierSize_(newFrontierSize) {} KOKKOS_INLINE_FUNCTION @@ -1964,12 +1782,10 @@ class GraphColor_VBD nnz_lno_temp_work_view_t newFrontier_; Kokkos::View newFrontierSize_; - functorInitialDependency( - const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t score, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize) + functorInitialDependency(const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t score, nnz_lno_persistent_work_view_t dependency, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize) : xadj_(rowPtr), adj_(colInd), score_(score), @@ -1979,8 +1795,7 @@ class GraphColor_VBD KOKKOS_INLINE_FUNCTION void operator()(const int node) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; int myScore = score_(node); int numNeighs = xadj_(node + 1) - xadj_(node); nnz_lno_t numVerts = xadj_.extent(0) - 1; @@ -1996,9 +1811,8 @@ class GraphColor_VBD } } if (dependency_(node) == 0) { - const size_type newFrontierIdx = - Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); - newFrontier_(newFrontierIdx) = node; + const size_type newFrontierIdx = Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); + newFrontier_(newFrontierIdx) = node; } } @@ -2016,14 +1830,12 @@ class GraphColor_VBD color_view_type colors_; Kokkos::View bannedColors_; - functorDeterministicColoring( - const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t frontier, - Kokkos::View frontierSize, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize, - size_type maxColors, color_view_type colors) + functorDeterministicColoring(const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t dependency, nnz_lno_temp_work_view_t frontier, + Kokkos::View frontierSize, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize, size_type maxColors, + color_view_type colors) : xadj_(rowPtr), adj_(colInd), dependency_(dependency), @@ -2033,14 +1845,12 @@ class GraphColor_VBD newFrontierSize_(newFrontierSize), maxColors_(maxColors), colors_(colors), - bannedColors_("KokkosKernels::bannedColors", frontier.size(), - maxColors_) {} + bannedColors_("KokkosKernels::bannedColors", frontier.size(), maxColors_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type frontierIdx) const { nnz_lno_t numVerts = xadj_.extent(0) - 1; - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; size_type frontierNode = frontier_(frontierIdx); for (size_type colorIdx = 0; colorIdx < maxColors_; ++colorIdx) { bannedColors_(frontierIdx, colorIdx) = 0; @@ -2048,8 +1858,7 @@ class GraphColor_VBD // Loop over neighbors, find banned colors, decrement dependency and // update newFrontier - for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); - ++i) { + for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); ++i) { nnz_lno_t neigh = adj_(i); // Skip remote edges (in case this is part of a distributed graph) if (neigh >= numVerts) continue; @@ -2059,13 +1868,11 @@ class GraphColor_VBD // so let's check that the node is not already colored, i.e. // its dependency is not -1. if (dependency_(neigh) >= 0) { - nnz_lno_t myDependency = - Kokkos::atomic_fetch_add(&dependency_(neigh), -1); + nnz_lno_t myDependency = Kokkos::atomic_fetch_add(&dependency_(neigh), -1); // dependency(neigh) = dependency(neigh) - 1; if (myDependency - 1 == 0) { - const size_type newFrontierIdx = Kokkos::atomic_fetch_add( - &newFrontierSize_(), atomic_incr_type(1)); - newFrontier_(newFrontierIdx) = neigh; + const size_type newFrontierIdx = Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); + newFrontier_(newFrontierIdx) = neigh; } } } // Loop over neighbors @@ -2090,14 +1897,12 @@ class GraphColor_VBD size_type maxColors_; color_view_type colors_; - functorDeterministicColoringBitArray( - const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t frontier, - Kokkos::View frontierSize, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize, - size_type maxColors, color_view_type colors) + functorDeterministicColoringBitArray(const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t dependency, nnz_lno_temp_work_view_t frontier, + Kokkos::View frontierSize, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize, + size_type maxColors, color_view_type colors) : xadj_(rowPtr), adj_(colInd), dependency_(dependency), @@ -2110,8 +1915,7 @@ class GraphColor_VBD KOKKOS_INLINE_FUNCTION void operator()(const size_type frontierIdx) const { - typedef typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; nnz_lno_t numVerts = xadj_.extent(0) - 1; size_type frontierNode = frontier_(frontierIdx); // Initialize bit array to all bits = 0 @@ -2121,8 +1925,7 @@ class GraphColor_VBD while (myColor == 0) { // Loop over neighbors, find banned colors in the range: // [colorOffset + 1, colorOffset + 64] - for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); - ++i) { + for (size_type i = xadj_(frontierNode); i < xadj_(frontierNode + 1); ++i) { nnz_lno_t neigh = adj_(i); if (neigh >= numVerts) continue; color_t neighColor = colors_(neigh); @@ -2136,12 +1939,10 @@ class GraphColor_VBD // so let's check that the node is not already colored, i.e. // its dependency is not -1. if (colorOffset == 0 && dependency_(neigh) >= 0) { - nnz_lno_t myDependency = - Kokkos::atomic_fetch_add(&dependency_(neigh), -1); + nnz_lno_t myDependency = Kokkos::atomic_fetch_add(&dependency_(neigh), -1); if (myDependency - 1 == 0) { - const size_type newFrontierIdx = Kokkos::atomic_fetch_add( - &newFrontierSize_(), atomic_incr_type(1)); - newFrontier_(newFrontierIdx) = neigh; + const size_type newFrontierIdx = Kokkos::atomic_fetch_add(&newFrontierSize_(), atomic_incr_type(1)); + newFrontier_(newFrontierIdx) = neigh; } } } // Loop over neighbors @@ -2169,10 +1970,8 @@ class GraphColor_VBD * Performs a edge_base coloring, with the hope of better load balance * as well as better memory accesses on GPUs. */ -template -class GraphColor_EB : public GraphColor { +template +class GraphColor_EB : public GraphColor { // FIXME SYCL: This does not work, returns colors with conflicts. public: typedef long long int ban_type; @@ -2187,41 +1986,30 @@ class GraphColor_EB : public GraphColor - single_dim_index_view_type; + typedef typename Kokkos::View single_dim_index_view_type; - typedef typename single_dim_index_view_type::HostMirror - single_dim_index_host_view_type; // Host view type + typedef typename single_dim_index_view_type::HostMirror single_dim_index_host_view_type; // Host view type typedef Kokkos::RangePolicy my_exec_space; - typedef typename HandleType::size_type_temp_work_view_t - size_type_temp_work_view_t; - typedef typename HandleType::size_type_persistent_work_view_t - size_type_persistent_work_view_t; + typedef typename HandleType::size_type_temp_work_view_t size_type_temp_work_view_t; + typedef typename HandleType::size_type_persistent_work_view_t size_type_persistent_work_view_t; - typedef - typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; - typedef typename Kokkos::View - color_temp_work_view_type; + typedef typename Kokkos::View color_temp_work_view_type; typedef Kokkos::View char_temp_work_view_type; - typedef typename char_temp_work_view_type::HostMirror - char_temp_work_host_view_type; // Host view type + typedef typename char_temp_work_view_type::HostMirror char_temp_work_host_view_type; // Host view type typedef typename in_row_index_view_type::const_type const_lno_row_view_t; - typedef typename in_nonzero_index_view_type::const_type - const_nonzero_index_view_type; + typedef typename in_nonzero_index_view_type::const_type const_nonzero_index_view_type; public: /** @@ -2231,12 +2019,10 @@ class GraphColor_EB : public GraphColor(nv_, ne_, row_map, entries, - coloring_handle) {} + : GraphColor(nv_, ne_, row_map, entries, + coloring_handle) {} /** * \brief Class Destructor. @@ -2256,7 +2042,7 @@ class GraphColor_EB : public GraphColorcp->get_eb_num_initial_colors(); double pps_cutoff = this->cp->get_min_reduction_for_conflictlist(); size_type ps_min = this->cp->get_min_elements_for_conflictlist(); - bool use_pps = (this->cp->get_conflict_list_type() == COLORING_PPS); + bool use_pps = (this->cp->get_conflict_list_type() == COLORING_PPS); bool tictoc = this->cp->get_tictoc(); @@ -2264,53 +2050,40 @@ class GraphColor_EB : public GraphColorcp->get_lower_diagonal_edge_list(this->nv, this->ne, this->xadj, - this->adj, numEdges, _kok_src, - _kok_dst); + this->cp->get_lower_diagonal_edge_list(this->nv, this->ne, this->xadj, this->adj, numEdges, _kok_src, _kok_dst); size_type num_work_edges = numEdges; // allocate memory for vertex ban colors, and tentative bans - color_temp_work_view_type color_ban( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "color_ban"), this->nv); - color_temp_work_view_type tentative_color_ban( - "tentative_color_ban", this->nv); // views are initialized with zero + color_temp_work_view_type color_ban(Kokkos::view_alloc(Kokkos::WithoutInitializing, "color_ban"), this->nv); + color_temp_work_view_type tentative_color_ban("tentative_color_ban", this->nv); // views are initialized with zero // allocate memory for vertex color set shifts. nnz_lno_temp_work_view_t color_set("color_set", this->nv); // initialized with zero. // initialize colors, color bans - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::initColors", my_exec_space(0, this->nv), - init_colors(kok_colors, color_ban, numInitialColors, color_set)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::initColors", my_exec_space(0, this->nv), + init_colors(kok_colors, color_ban, numInitialColors, color_set)); // std::cout << "nv:" << this->nv << " init_colors" << std::endl; // worklist size_type_temp_work_view_t edge_conflict_indices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "edge_conflict_indices"), - num_work_edges); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "edge_conflict_indices"), num_work_edges); // next iterations conflict list size_type_temp_work_view_t new_edge_conflict_indices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "new_edge_conflict_indices"), - num_work_edges); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new_edge_conflict_indices"), num_work_edges); char_temp_work_view_type edge_conflict_marker( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "edge_conflict_marker"), - num_work_edges); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "edge_conflict_marker"), num_work_edges); // initialize the worklist sequentiall, and markers as 1. - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::InitWorkArrays", - my_exec_space(0, num_work_edges), - init_work_arrays(edge_conflict_indices, edge_conflict_marker)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::InitWorkArrays", my_exec_space(0, num_work_edges), + init_work_arrays(edge_conflict_indices, edge_conflict_marker)); MyExecSpace().fence(); // std::cout << "nv:" << this->nv << " init_work_arrays" << std::endl; @@ -2319,8 +2092,7 @@ class GraphColor_EB : public GraphColorseconds(); timer->reset(); } - double mc_time = 0, cnt_time = 0, ban_time = 0, expand_ban_time = 0, - color_time = 0, pps_time = 0; + double mc_time = 0, cnt_time = 0, ban_time = 0, expand_ban_time = 0, color_time = 0, pps_time = 0; size_type i = 0; @@ -2340,12 +2112,9 @@ class GraphColor_EB : public GraphColornv << " i:" << i << " num_work_edges:" << // num_work_edges<< std::endl; conflict detection mark conflicts as color // 0. update their bans - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::HalfEdgeMarkConflicts", - my_exec_space(0, num_work_edges), - halfedge_mark_conflicts(_kok_src, _kok_dst, kok_colors, color_set, - color_ban, tentative_color_ban, - edge_conflict_indices)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::HalfEdgeMarkConflicts", my_exec_space(0, num_work_edges), + halfedge_mark_conflicts(_kok_src, _kok_dst, kok_colors, color_set, color_ban, + tentative_color_ban, edge_conflict_indices)); MyExecSpace().fence(); // std::cout << "nv:" << this->nv << " i:" << i << " @@ -2361,13 +2130,10 @@ class GraphColor_EB : public GraphColor 0) - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::HalfEdgeConflictsCount", - my_exec_space(0, num_work_edges), - halfedge_conflict_count(_kok_src, _kok_dst, kok_colors, color_set, - edge_conflict_indices, - edge_conflict_marker), - num_conflict_reduction); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::HalfEdgeConflictsCount", my_exec_space(0, num_work_edges), + halfedge_conflict_count(_kok_src, _kok_dst, kok_colors, color_set, + edge_conflict_indices, edge_conflict_marker), + num_conflict_reduction); MyExecSpace().fence(); @@ -2396,26 +2162,19 @@ class GraphColor_EB : public GraphColor ps_min && - num_conflict_reduction / double(num_work_edges) > pps_cutoff) { + if (num_work_edges > ps_min && num_conflict_reduction / double(num_work_edges) > pps_cutoff) { // use_pps = false; if (use_pps) { - Kokkos::parallel_scan("KokkosGraph::GraphColoring::CalcEdgePositions", - my_exec_space(0, num_work_edges), - ppsWorklistFunctorEB(edge_conflict_indices, - new_edge_conflict_indices, - edge_conflict_marker)); + Kokkos::parallel_scan( + "KokkosGraph::GraphColoring::CalcEdgePositions", my_exec_space(0, num_work_edges), + ppsWorklistFunctorEB(edge_conflict_indices, new_edge_conflict_indices, edge_conflict_marker)); } else { // create new worklist - single_dim_index_view_type new_index = - single_dim_index_view_type("recolorListLength"); + single_dim_index_view_type new_index = single_dim_index_view_type("recolorListLength"); ; - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::CreateNewWorkArrayAtomic", - my_exec_space(0, num_work_edges), - atomic_create_new_work_array(new_index, edge_conflict_indices, - edge_conflict_marker, - new_edge_conflict_indices)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::CreateNewWorkArrayAtomic", my_exec_space(0, num_work_edges), + atomic_create_new_work_array(new_index, edge_conflict_indices, edge_conflict_marker, + new_edge_conflict_indices)); MyExecSpace().fence(); } @@ -2433,12 +2192,9 @@ class GraphColor_EB : public GraphColorseconds(); @@ -2463,27 +2217,22 @@ class GraphColor_EB : public GraphColornv), - choose_colors(kok_colors, color_set, color_ban, tentative_color_ban)); + Kokkos::parallel_for("KokkosGraph::GraphColoring::ChooseColors", my_exec_space(0, this->nv), + choose_colors(kok_colors, color_set, color_ban, tentative_color_ban)); if (tictoc) { color_time += timer->seconds(); timer->reset(); } } if (tictoc) { - std::cout << "\tinit_time:" << inittime << " mc:" << mc_time - << " cnt_time:" << cnt_time << " ban_time:" << ban_time - << " expand ban time:" << expand_ban_time - << " pps time:" << pps_time << " color time:" << color_time - << std::endl + std::cout << "\tinit_time:" << inittime << " mc:" << mc_time << " cnt_time:" << cnt_time + << " ban_time:" << ban_time << " expand ban time:" << expand_ban_time << " pps time:" << pps_time + << " color time:" << color_time << std::endl << std::endl; } // set the final colors. - Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", - my_exec_space(0, this->nv), + Kokkos::parallel_for("KokkosGraph::GraphColoring::SetFinalColors", my_exec_space(0, this->nv), set_final_colors(kok_colors, color_set)); num_loops = i; @@ -2500,7 +2249,7 @@ class GraphColor_EB : public GraphColor - _color_set(s))) || // if source is colored, and destination - // color set is larger than source - (dc && (_color_set(s) > - _color_set(d))) // or if destionation is colored, and the - // source color set is larger + if ((dc && sc) || // if both colored + (sc && (_color_set(d) > _color_set(s))) || // if source is colored, and destination + // color set is larger than source + (dc && (_color_set(s) > _color_set(d))) // or if destionation is colored, and the + // source color set is larger ) { // then no need to look at this edge anymore. _edge_conflict_marker(w) = 0; @@ -2696,8 +2430,7 @@ class GraphColor_EB : public GraphColor::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; size_type w = _edge_conflict_indices(ii); if (_edge_conflict_marker(w)) { - const size_type future_index = - Kokkos::atomic_fetch_add(&_new_index(), atomic_incr_type(1)); + const size_type future_index = Kokkos::atomic_fetch_add(&_new_index(), atomic_incr_type(1)); _new_edge_conflict_indices(future_index) = w; } } @@ -2751,11 +2480,9 @@ class GraphColor_EB : public GraphColor(&(color_ban(uncolored_vertex)), - src_col | dst_col); + Kokkos::atomic_fetch_or(&(color_ban(uncolored_vertex)), src_col | dst_col); edge_conflict_marker(work_index) = 0; } } @@ -2821,9 +2544,8 @@ class GraphColor_EB : public GraphColor dst_id) ? src_id : dst_id; - nnz_lno_t smaller_index = - dst_id; // TODO which one is better? this seems to be not - // much changing + nnz_lno_t smaller_index = dst_id; // TODO which one is better? this seems to be not + // much changing // idx smaller_index = src_id; // then both have been colored tentavitely. propoagate the color // of src to dst. - Kokkos::atomic_fetch_or( - &(tentative_color_ban(smaller_index)), -src_col); - nnz_lno_t banned_colors = ~(color_ban(smaller_index) | - tentative_color_ban(smaller_index)); + Kokkos::atomic_fetch_or(&(tentative_color_ban(smaller_index)), -src_col); + nnz_lno_t banned_colors = ~(color_ban(smaller_index) | tentative_color_ban(smaller_index)); nnz_lno_t larger_col = banned_colors & (-banned_colors); kokcolors(smaller_index) = -(larger_col); } @@ -2909,16 +2627,14 @@ class GraphColor_EB : public GraphColor(&(color_ban(dst_id)), // -src_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), - -src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), -src_col); } else if (dst_col != 0) { // if it is dst tentatively colors, but src is not colored, // then we send the dst color info to src's tentative_ban // Kokkos::atomic_fetch_or(&(color_ban(src_id)), // -dst_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), - -dst_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), -dst_col); } else { // idx smaller_index = src_id < dst_id > 0 ? src_id: dst_id; // idx larger_index = src_id < dst_id > 0 ? dst_id : src_id; @@ -2937,16 +2653,14 @@ class GraphColor_EB : public GraphColor( - &(tentative_color_ban(larger_index)), src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(larger_index)), src_col); // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), // src_col); } @@ -2961,15 +2675,13 @@ class GraphColor_EB : public GraphColor -void graph_color_impl(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - lno_row_view_t_ row_map, lno_nnz_view_t_ entries) { +template +void graph_color_impl(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries) { Kokkos::Timer timer; - typename KernelHandle::GraphColoringHandleType *gch = - handle->get_graph_coloring_handle(); + typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle(); ColoringAlgorithm algorithm = gch->get_coloring_algo_type(); - typedef typename KernelHandle::GraphColoringHandleType::color_view_t - color_view_type; + typedef typename KernelHandle::GraphColoringHandleType::color_view_t color_view_type; gch->set_tictoc(handle->get_verbose()); @@ -3119,46 +2820,35 @@ void graph_color_impl(KernelHandle *handle, colors_out = color_view_type("Graph Colors", num_rows); } - typedef - typename Impl::GraphColor - BaseGraphColoring; + typedef typename Impl::GraphColor + BaseGraphColoring; BaseGraphColoring *gc = NULL; switch (algorithm) { - case COLORING_SERIAL: - gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; + case COLORING_SERIAL: gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_VB: case COLORING_VBBIT: case COLORING_VBCS: - typedef typename Impl::GraphColor_VB< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - VBGraphColoring; - gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); + typedef + typename Impl::GraphColor_VB + VBGraphColoring; + gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_VBD: case COLORING_VBDBIT: - typedef typename Impl::GraphColor_VBD< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> + typedef typename Impl::GraphColor_VBD VBDGraphColoring; - gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); + gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_EB: - typedef typename Impl::GraphColor_EB< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - EBGraphColoring; - gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); + typedef + typename Impl::GraphColor_EB + EBGraphColoring; + gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, gch); break; case COLORING_DEFAULT: break; diff --git a/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/graph/impl/KokkosGraph_Distance2Color_impl.hpp index 58b6d79ebb..cfa5186283 100644 --- a/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -53,8 +53,7 @@ namespace Impl { * Distance-1 conflicts will not be checked. * */ -template +template class GraphColorDistance2 { // Need mutable entries type for edge filtering using nc_entries_t = typename entries_t::non_const_type; @@ -109,9 +108,8 @@ class GraphColorDistance2 { * \param handle: GraphColoringHandle object that holds the specification * about the graph coloring, including parameters. */ - GraphColorDistance2(lno_t nr_, lno_t nc_, rowmap_t row_map, entries_t entries, - rowmap_t t_row_map, entries_t t_entries, - HandleType* handle) + GraphColorDistance2(lno_t nr_, lno_t nc_, rowmap_t row_map, entries_t entries, rowmap_t t_row_map, + entries_t t_entries, HandleType* handle) : nr(nr_), nc(nc_), ne(entries.extent(0)), @@ -163,9 +161,8 @@ class GraphColorDistance2 { case COLORING_D2_NB_BIT: compute_d2_coloring_nb(colors_out); break; case COLORING_D2_SERIAL: compute_d2_coloring_serial(colors_out); break; default: - throw std::runtime_error( - std::string("D2 coloring handle has invalid algorithm: ") + - std::to_string((int)this->gc_handle->get_coloring_algo_type())); + throw std::runtime_error(std::string("D2 coloring handle has invalid algorithm: ") + + std::to_string((int)this->gc_handle->get_coloring_algo_type())); } } @@ -179,16 +176,11 @@ class GraphColorDistance2 { // adjacency list ) if (this->_ticToc) { std::cout << "\tcolor_graph_d2 params:" << std::endl - << "\t algorithm : " - << this->gc_handle->getD2AlgorithmName() << std::endl - << "\t ticToc : " << this->_ticToc - << std::endl - << "\t max_num_iterations : " - << this->_max_num_iterations << std::endl - << "\t chunkSize : " << this->_chunkSize - << std::endl - << "\t Edge Filtering Pass? : " - << (int)using_edge_filtering << std::endl + << "\t algorithm : " << this->gc_handle->getD2AlgorithmName() << std::endl + << "\t ticToc : " << this->_ticToc << std::endl + << "\t max_num_iterations : " << this->_max_num_iterations << std::endl + << "\t chunkSize : " << this->_chunkSize << std::endl + << "\t Edge Filtering Pass? : " << (int)using_edge_filtering << std::endl << "\tgraph information:" << std::endl << "\t nr : " << this->nr << std::endl << "\t ne : " << this->ne << std::endl; @@ -203,9 +195,7 @@ class GraphColorDistance2 { // conflictlist - store conflicts that can happen when we're coloring in // parallel. - lno_view_t current_vertexList( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), - this->nr); + lno_view_t current_vertexList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "vertexList"), this->nr); lno_t current_vertexListLength = this->nr; @@ -215,13 +205,10 @@ class GraphColorDistance2 { current_vertexListLength = this->gc_handle->get_vertex_list_size(); } else { // init conflictlist sequentially. - Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), - functorInitList(current_vertexList)); + Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList(current_vertexList)); } // Next iteratons's conflictList - lno_view_t next_iteration_recolorList( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), - this->nr); + lno_view_t next_iteration_recolorList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "recolorList"), this->nr); // Size the next iteration conflictList single_lno_view_t next_iteration_recolorListLength("recolorListLength"); @@ -251,15 +238,11 @@ class GraphColorDistance2 { // entries_t, // so that it has the same type as adj // * on the other hand, t_adj is not actually modified by EF functor - lno_view_t adj_copy( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), - this->ne); + lno_view_t adj_copy(Kokkos::view_alloc(Kokkos::WithoutInitializing, "adj copy"), this->ne); Kokkos::deep_copy(adj_copy, this->adj); - this->colorGreedyEF(this->xadj, adj_copy, this->t_xadj, this->t_adj, - colors_out); + this->colorGreedyEF(this->xadj, adj_copy, this->t_xadj, this->t_adj, colors_out); } else { - this->colorGreedy(this->xadj, this->adj, this->t_xadj, this->t_adj, - colors_out, current_vertexList, + this->colorGreedy(this->xadj, this->adj, this->t_xadj, this->t_adj, colors_out, current_vertexList, current_vertexListLength); } @@ -269,10 +252,8 @@ class GraphColorDistance2 { time = timer.seconds(); total_time += time; std::cout << "\tIteration: " << iter << std::endl - << "\t - Time speculative greedy phase : " << time - << std::endl - << "\t - Num Uncolored (greedy-color) : " << numUncolored - << std::endl; + << "\t - Time speculative greedy phase : " << time << std::endl + << "\t - Num Uncolored (greedy-color) : " << numUncolored << std::endl; gc_handle->add_to_overall_coloring_time_phase1(time); @@ -289,20 +270,17 @@ class GraphColorDistance2 { // NOTE: not using colorset algorithm in this so we don't include colorset // data - numUncolored = this->findConflicts( - swap_work_arrays, this->xadj, this->adj, this->t_xadj, this->t_adj, - colors_out, current_vertexList, current_vertexListLength, - next_iteration_recolorList, next_iteration_recolorListLength); + numUncolored = this->findConflicts(swap_work_arrays, this->xadj, this->adj, this->t_xadj, this->t_adj, colors_out, + current_vertexList, current_vertexListLength, next_iteration_recolorList, + next_iteration_recolorListLength); execution_space().fence(); if (_ticToc) { time = timer.seconds(); total_time += time; - std::cout << "\t - Time conflict detection : " << time - << std::endl; - std::cout << "\t - Num Uncolored (conflicts) : " << numUncolored - << std::endl; + std::cout << "\t - Time conflict detection : " << time << std::endl; + std::cout << "\t - Num Uncolored (conflicts) : " << numUncolored << std::endl; gc_handle->add_to_overall_coloring_time_phase2(time); timer.reset(); } @@ -315,9 +293,8 @@ class GraphColorDistance2 { current_vertexList = next_iteration_recolorList; next_iteration_recolorList = temp; - current_vertexListLength = numUncolored; - next_iteration_recolorListLength = - single_lno_view_t("recolorListLength"); + current_vertexListLength = numUncolored; + next_iteration_recolorListLength = single_lno_view_t("recolorListLength"); } } @@ -331,8 +308,7 @@ class GraphColorDistance2 { // clean up in serial (resolveConflictsSerial) // ------------------------------------------ if (numUncolored > 0) { - this->resolveConflictsSerial(this->xadj, this->adj, this->t_xadj, - this->t_adj, colors_out, current_vertexList, + this->resolveConflictsSerial(this->xadj, this->adj, this->t_xadj, this->t_adj, colors_out, current_vertexList, current_vertexListLength); } @@ -341,10 +317,8 @@ class GraphColorDistance2 { if (_ticToc) { time = timer.seconds(); total_time += time; - std::cout << "\tTime serial conflict resolution : " << time - << std::endl; - std::cout << "\tTotal time for coloring : " << total_time - << std::endl; + std::cout << "\tTime serial conflict resolution : " << time << std::endl; + std::cout << "\tTotal time for coloring : " << total_time << std::endl; gc_handle->add_to_overall_coloring_time_phase3(time); } @@ -356,11 +330,9 @@ class GraphColorDistance2 { template struct NB_Coloring { - NB_Coloring(const lno_view_t& worklist_, const single_lno_view_t& worklen_, - color_type colorBase_, const forbidden_view& forbidden_, - color_view_type colors_, const rowmap_t& Vrowmap_, - const entries_t& Vcolinds_, lno_t vertsPerThread_, - lno_t numCols_) + NB_Coloring(const lno_view_t& worklist_, const single_lno_view_t& worklen_, color_type colorBase_, + const forbidden_view& forbidden_, color_view_type colors_, const rowmap_t& Vrowmap_, + const entries_t& Vcolinds_, lno_t vertsPerThread_, lno_t numCols_) : worklist(worklist_), worklen(worklen_), colorBase(colorBase_), @@ -387,8 +359,7 @@ class GraphColorDistance2 { for (size_type j = rowBegin; j < rowEnd; j++) { lno_t nei = Vcolinds(j); if (nei < numCols) { - for (int b = 0; b < batch; b++) - forbid[b] |= forbidden(nei * batch + b); + for (int b = 0; b < batch; b++) forbid[b] |= forbidden(nei * batch + b); } } // Find the first 0 bit in forbid @@ -405,27 +376,22 @@ class GraphColorDistance2 { break; } } - if (color && (colors(v) == 0 || colors(v) == CONFLICTED || - colors(v) == UNCOLORABLE)) { + if (color && (colors(v) == 0 || colors(v) == CONFLICTED || colors(v) == UNCOLORABLE)) { // Color v colors(v) = color; if (!doing_bipartite) { // Update forbidden for v (preventing dist-1 conflicts) - if (v < numCols) - Kokkos::atomic_fetch_or(&forbidden(v * batch + colorWord), - (uint32_t)1 << colorBit); + if (v < numCols) Kokkos::atomic_fetch_or(&forbidden(v * batch + colorWord), (uint32_t)1 << colorBit); } // Update forbidden for all of v's neighbors for (size_type j = rowBegin; j < rowEnd; j++) { lno_t nei = Vcolinds(j); if (nei < numCols) { // Update column forbidden - Kokkos::atomic_fetch_or(&forbidden(nei * batch + colorWord), - (uint32_t)1 << colorBit); + Kokkos::atomic_fetch_or(&forbidden(nei * batch + colorWord), (uint32_t)1 << colorBit); } } - } else if (colors(v) == 0 || colors(v) == CONFLICTED || - colors(v) == UNCOLORABLE) { + } else if (colors(v) == 0 || colors(v) == CONFLICTED || colors(v) == UNCOLORABLE) { colors(v) = UNCOLORABLE; } } @@ -444,9 +410,8 @@ class GraphColorDistance2 { template struct NB_Conflict { - NB_Conflict(color_type colorBase_, const forbidden_view& forbidden_, - const color_view_type& colors_, const rowmap_t& Crowmap_, - const entries_t& Ccolinds_, lno_t numVerts_) + NB_Conflict(color_type colorBase_, const forbidden_view& forbidden_, const color_view_type& colors_, + const rowmap_t& Crowmap_, const entries_t& Ccolinds_, lno_t numVerts_) : colorBase(colorBase_), forbidden(forbidden_), colors(colors_), @@ -513,10 +478,8 @@ class GraphColorDistance2 { template struct NB_RefreshForbidden { - NB_RefreshForbidden(color_type colorBase_, const forbidden_view& forbidden_, - const color_view_type& colors_, - const rowmap_t& Crowmap_, const entries_t& Ccolinds_, - lno_t numVerts_) + NB_RefreshForbidden(color_type colorBase_, const forbidden_view& forbidden_, const color_view_type& colors_, + const rowmap_t& Crowmap_, const entries_t& Ccolinds_, lno_t numVerts_) : colorBase(colorBase_), colorEnd(colorBase + 32 * batch), forbidden(forbidden_), @@ -563,12 +526,11 @@ class GraphColorDistance2 { }; struct NB_Worklist { - NB_Worklist(const color_view_type colors_, const lno_view_t& worklist_, - const single_lno_view_t& worklen_, lno_t nr_) + NB_Worklist(const color_view_type colors_, const lno_view_t& worklist_, const single_lno_view_t& worklen_, + lno_t nr_) : colors(colors_), worklist(worklist_), worklen(worklen_), nr(nr_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, bool finalPass) const { if (colors(v) == CONFLICTED) { if (finalPass) worklist(lnum) = v; lnum++; @@ -587,12 +549,11 @@ class GraphColorDistance2 { }; struct NB_UpdateBatch { - NB_UpdateBatch(const color_view_type& colors_, const lno_view_t& worklist_, - const single_lno_view_t& worklen_, lno_t nr_) + NB_UpdateBatch(const color_view_type& colors_, const lno_view_t& worklist_, const single_lno_view_t& worklen_, + lno_t nr_) : colors(colors_), worklist(worklist_), worklen(worklen_), nr(nr_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t v, lno_t& lnum, bool finalPass) const { if (colors(v) == UNCOLORABLE) { if (finalPass) worklist(lnum) = v; lnum++; @@ -630,8 +591,7 @@ class GraphColorDistance2 { Kokkos::deep_copy(worklen, this->nr); // init conflictlist sequentially. - Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), - functorInitList(worklist)); + Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList(worklist)); // Estimate the number of colors that will be needed // The algorithm can't use more colors than the max distance-2 degree, @@ -670,7 +630,7 @@ class GraphColorDistance2 { // for batch size while (currentWork) { lno_t vertsPerThread = 1; - lno_t workBatches = (currentWork + vertsPerThread - 1) / vertsPerThread; + lno_t workBatches = (currentWork + vertsPerThread - 1) / vertsPerThread; timer.reset(); // if still using this color set, refresh forbidden. // This avoids using too many colors, by relying on forbidden from @@ -681,26 +641,22 @@ class GraphColorDistance2 { case 1: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<1>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<1>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; case 2: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<2>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<2>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; case 4: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<4>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<4>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; case 8: Kokkos::parallel_for( "NB D2 Forbidden", range_policy_type(0, numCols), - NB_RefreshForbidden<8>(colorBase, forbidden, colors_out, - this->t_xadj, this->t_adj, numVerts)); + NB_RefreshForbidden<8>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); break; default:; } @@ -709,62 +665,46 @@ class GraphColorDistance2 { switch (batch) { case 1: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", range_policy_type(0, workBatches), - NB_Coloring<1>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<1>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<1>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<1>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; case 2: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", range_policy_type(0, workBatches), - NB_Coloring<2>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<2>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<2>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<2>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; case 4: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", range_policy_type(0, workBatches), - NB_Coloring<4>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<4>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<4>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<4>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; case 8: timer.reset(); - Kokkos::parallel_for( - "NB D2 Coloring", range_policy_type(0, workBatches), - NB_Coloring<8>(worklist, worklen, colorBase, forbidden, - colors_out, this->xadj, this->adj, - vertsPerThread, numCols)); + Kokkos::parallel_for("NB D2 Coloring", range_policy_type(0, workBatches), + NB_Coloring<8>(worklist, worklen, colorBase, forbidden, colors_out, this->xadj, + this->adj, vertsPerThread, numCols)); colorTime += timer.seconds(); timer.reset(); - Kokkos::parallel_for( - "NB D2 Conflict Resolution", range_policy_type(0, numCols), - NB_Conflict<8>(colorBase, forbidden, colors_out, this->t_xadj, - this->t_adj, numVerts)); + Kokkos::parallel_for("NB D2 Conflict Resolution", range_policy_type(0, numCols), + NB_Conflict<8>(colorBase, forbidden, colors_out, this->t_xadj, this->t_adj, numVerts)); conflictTime += timer.seconds(); break; default: @@ -774,17 +714,15 @@ class GraphColorDistance2 { } timer.reset(); // Then build the next worklist - Kokkos::parallel_scan( - "NB D2 worklist", range_policy_type(0, numVerts), - NB_Worklist(colors_out, worklist, worklen, numVerts), currentWork); + Kokkos::parallel_scan("NB D2 worklist", range_policy_type(0, numVerts), + NB_Worklist(colors_out, worklist, worklen, numVerts), currentWork); worklistTime += timer.seconds(); timer.reset(); iter++; } // Will need to run with a different color base, so rebuild the work list - Kokkos::parallel_scan( - "NB D2 Worklist Rebuild", range_policy_type(0, numVerts), - NB_UpdateBatch(colors_out, worklist, worklen, numVerts)); + Kokkos::parallel_scan("NB D2 Worklist Rebuild", range_policy_type(0, numVerts), + NB_UpdateBatch(colors_out, worklist, worklen, numVerts)); Kokkos::deep_copy(currentWork, worklen); worklistTime += timer.seconds(); timer.reset(); @@ -802,9 +740,7 @@ class GraphColorDistance2 { std::cout << "Conflict: " << conflictTime << '\n'; std::cout << "Forbidden: " << forbiddenTime << '\n'; std::cout << "Worklist: " << worklistTime << '\n'; - std::cout << "** Total: " - << colorTime + conflictTime + forbiddenTime + worklistTime - << "\n\n"; + std::cout << "** Total: " << colorTime + conflictTime + forbiddenTime + worklistTime << "\n\n"; } if (this->_ticToc) { gc_handle->add_to_overall_coloring_time_phase1(timer.seconds()); @@ -838,8 +774,8 @@ class GraphColorDistance2 { Kokkos::View Vcolinds = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), this->adj); // Create worklist - Kokkos::View worklist( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Worklist"), this->nr); + Kokkos::View worklist(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Worklist"), + this->nr); int iter = 0; Kokkos::Timer timer; lno_t currentWork = this->nr; @@ -898,10 +834,8 @@ class GraphColorDistance2 { // GraphColorDistance2::colorGreedy() // // ----------------------------------------------------------------- - void colorGreedy(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, - entries_t t_adj_, color_view_type vertex_colors_, - lno_view_t current_vertexList_, - lno_t current_vertexListLength_) { + void colorGreedy(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, color_view_type vertex_colors_, + lno_view_t current_vertexList_, lno_t current_vertexListLength_) { lno_t chunkSize_ = this->_chunkSize; if (current_vertexListLength_ < 100 * chunkSize_) { @@ -917,11 +851,9 @@ class GraphColorDistance2 { // 3. [S] loop over vertex neighbors // 4. [S] loop over vertex neighbors of neighbors case COLORING_D2_VB: { - functorGreedyColorVB gc(this->nr, this->nc, xadj_, adj_, t_xadj_, - t_adj_, vertex_colors_, current_vertexList_, + functorGreedyColorVB gc(this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, current_vertexList_, current_vertexListLength_); - Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), - gc); + Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), gc); } break; // One level Perallelism, BIT Array for coloring @@ -930,11 +862,9 @@ class GraphColorDistance2 { // 3. [S] loop over vertex neighbors // 4. [S] loop over vertex neighbors of neighbors case COLORING_D2_VB_BIT: { - functorGreedyColorVB_BIT gc(this->nr, this->nc, xadj_, adj_, t_xadj_, - t_adj_, vertex_colors_, current_vertexList_, - current_vertexListLength_); - Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), - gc); + functorGreedyColorVB_BIT gc(this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, + current_vertexList_, current_vertexListLength_); + Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), gc); } break; default: @@ -950,8 +880,8 @@ class GraphColorDistance2 { // GraphColorDistance2::colorGreedyEF() // // ----------------------------------------------------------------- - void colorGreedyEF(rowmap_t xadj_, lno_view_t adj_copy_, rowmap_t t_xadj_, - entries_t t_adj_copy_, color_view_type vertex_colors_) { + void colorGreedyEF(rowmap_t xadj_, lno_view_t adj_copy_, rowmap_t t_xadj_, entries_t t_adj_copy_, + color_view_type vertex_colors_) { // Pick the right coloring algorithm to use based on which algorithm we're // using switch (this->gc_handle->get_coloring_algo_type()) { @@ -961,10 +891,8 @@ class GraphColorDistance2 { // 3. [S] loop over vertex neighbors // 4. [S] loop over vertex neighbors of neighbors case COLORING_D2_VB_BIT_EF: { - functorGreedyColorVB_BIT_EF gc(this->nr, this->nc, xadj_, adj_copy_, - t_xadj_, t_adj_copy_, vertex_colors_); - Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), - gc); + functorGreedyColorVB_BIT_EF gc(this->nr, this->nc, xadj_, adj_copy_, t_xadj_, t_adj_copy_, vertex_colors_); + Kokkos::parallel_for("LoopOverChunks", range_policy_type(0, this->nr), gc); // prettyPrint1DView(vertex_colors_, "COLORS_GC_VB_BIT",500); } break; @@ -980,23 +908,17 @@ class GraphColorDistance2 { // GraphColorDistance2::findConflicts() // // ----------------------------------------------------------------- - lno_t findConflicts(bool& swap_work_arrays, rowmap_t xadj_, entries_t adj_, - rowmap_t t_xadj_, entries_t t_adj_, - color_view_type vertex_colors_, - lno_view_t current_vertexList_, - lno_t current_vertexListLength_, - lno_view_t next_iteration_recolorList_, - single_lno_view_t next_iteration_recolorListLength_) { + lno_t findConflicts(bool& swap_work_arrays, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type vertex_colors_, lno_view_t current_vertexList_, lno_t current_vertexListLength_, + lno_view_t next_iteration_recolorList_, single_lno_view_t next_iteration_recolorListLength_) { swap_work_arrays = true; lno_t output_numUncolored = 0; - functorFindConflicts_Atomic conf( - this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, - current_vertexList_, next_iteration_recolorList_, - next_iteration_recolorListLength_); - Kokkos::parallel_reduce("FindConflicts", - range_policy_type(0, current_vertexListLength_), - conf, output_numUncolored); + functorFindConflicts_Atomic conf(this->nr, this->nc, xadj_, adj_, t_xadj_, t_adj_, vertex_colors_, + current_vertexList_, next_iteration_recolorList_, + next_iteration_recolorListLength_); + Kokkos::parallel_reduce("FindConflicts", range_policy_type(0, current_vertexListLength_), conf, + output_numUncolored); return output_numUncolored; } // findConflicts (end) @@ -1005,9 +927,8 @@ class GraphColorDistance2 { // GraphColorDistance2::resolveConflictsSerial() // // ----------------------------------------------------------------- - void resolveConflictsSerial(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, - entries_t t_adj_, color_view_type vertex_colors_, - lno_view_t current_vertexList_, + void resolveConflictsSerial(rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type vertex_colors_, lno_view_t current_vertexList_, size_type current_vertexListLength_) { color_type* forbidden = new color_type[nr]; for (lno_t i = 0; i < nr; i++) forbidden[i] = nr; @@ -1042,16 +963,14 @@ class GraphColorDistance2 { if (h_colors(vid) > 0) continue; // loop over distance-1 neighbors of vid - for (size_type vid_d1_adj = h_idx(vid); vid_d1_adj < h_idx(vid + 1); - vid_d1_adj++) { + for (size_type vid_d1_adj = h_idx(vid); vid_d1_adj < h_idx(vid + 1); vid_d1_adj++) { lno_t vid_d1 = h_adj(vid_d1_adj); if (vid_d1 < nc) { if (!doing_bipartite && vid_d1 != vid) { forbidden[h_colors(vid_d1)] = vid; } // loop over neighbors of vid_d1 (distance-2 from vid) - for (size_type vid_d2_adj = h_t_idx(vid_d1); - vid_d2_adj < h_t_idx(vid_d1 + 1); vid_d2_adj++) { + for (size_type vid_d2_adj = h_t_idx(vid_d1); vid_d2_adj < h_t_idx(vid_d1 + 1); vid_d2_adj++) { lno_t vid_d2 = h_t_adj(vid_d2_adj); // skip over loops vid -- x -- vid, and filter out-of-bounds @@ -1076,8 +995,7 @@ class GraphColorDistance2 { public: // pretty-print a 1D View with label template - void prettyPrint1DView(kokkos_view_t& view, const char* label, - const size_t max_entries = 500) const { + void prettyPrint1DView(kokkos_view_t& view, const char* label, const size_t max_entries = 500) const { int max_per_line = 20; int line_count = 1; std::cout << label << " = [ \n\t"; @@ -1132,10 +1050,8 @@ class GraphColorDistance2 { lno_t _vertexListLength; // lno_t _chunkSize; // - functorGreedyColorVB(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, - rowmap_t t_xadj_, entries_t t_adj_, - color_view_type colors, lno_view_t vertexList, - lno_t vertexListLength) + functorGreedyColorVB(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type colors, lno_view_t vertexList, lno_t vertexListLength) : nr(nr_), nc(nc_), _idx(xadj_), @@ -1173,15 +1089,13 @@ class GraphColorDistance2 { // but in distance-2 we'd need the total vertices at distance-2 which // we don't easily have aprioi. This could be as big as all the // vertices in the graph if diameter(G)=2... - for (color_type offset = 1; offset <= nr; - offset += VB_D2_COLORING_FORBIDDEN_SIZE) { + for (color_type offset = 1; offset <= nr; offset += VB_D2_COLORING_FORBIDDEN_SIZE) { // initialize for (int i = 0; i < VB_D2_COLORING_FORBIDDEN_SIZE; i++) { forbidden[i] = false; } // Check neighbors, fill forbidden array. - for (size_type vid_adj = vid_adj_begin; vid_adj < vid_adj_end; - vid_adj++) { + for (size_type vid_adj = vid_adj_begin; vid_adj < vid_adj_end; vid_adj++) { const lno_t vid_d1 = _adj(vid_adj); if (vid_d1 < nc) { if (!doing_bipartite) // note: compile-time branch (template @@ -1189,23 +1103,20 @@ class GraphColorDistance2 { { if (vid_d1 != vid) { const color_type c = _colors(vid_d1); - if ((c >= offset) && - (c - offset < VB_D2_COLORING_FORBIDDEN_SIZE)) { + if ((c >= offset) && (c - offset < VB_D2_COLORING_FORBIDDEN_SIZE)) { forbidden[c - offset] = true; } } } const size_type vid_d1_adj_begin = _t_idx(vid_d1); const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); - for (size_type vid_d1_adj = vid_d1_adj_begin; - vid_d1_adj < vid_d1_adj_end; vid_d1_adj++) { + for (size_type vid_d1_adj = vid_d1_adj_begin; vid_d1_adj < vid_d1_adj_end; vid_d1_adj++) { const lno_t vid_d2 = _t_adj(vid_d1_adj); // Skip distance-2-self-loops if (vid_d2 != vid && vid_d2 < nr) { const color_type c = _colors(vid_d2); - if ((c >= offset) && - (c - offset < VB_D2_COLORING_FORBIDDEN_SIZE)) { + if ((c >= offset) && (c - offset < VB_D2_COLORING_FORBIDDEN_SIZE)) { forbidden[c - offset] = true; } } @@ -1240,10 +1151,8 @@ class GraphColorDistance2 { lno_view_t _vertexList; // lno_t _vertexListLength; // - functorGreedyColorVB_BIT(lno_t nr_, lno_t nc_, rowmap_t xadj_, - entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, - color_view_type colors, lno_view_t vertexList, - lno_t vertexListLength) + functorGreedyColorVB_BIT(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, entries_t t_adj_, + color_view_type colors, lno_view_t vertexList, lno_t vertexListLength) : nr(nr_), nc(nc_), _idx(xadj_), @@ -1270,8 +1179,7 @@ class GraphColorDistance2 { const size_type vid_adj_begin = _idx(vid); const size_type vid_adj_end = _idx(vid + 1); - for (color_type offset = 1; - offset <= (nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); + for (color_type offset = 1; offset <= (nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); offset += VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // Forbidden colors // - single long int for forbidden colors @@ -1282,8 +1190,7 @@ class GraphColorDistance2 { bool break_out = false; // Loop over distance-1 neighbors of vid - for (size_type vid_adj = vid_adj_begin; - !break_out && vid_adj < vid_adj_end; ++vid_adj) { + for (size_type vid_adj = vid_adj_begin; !break_out && vid_adj < vid_adj_end; ++vid_adj) { const lno_t vid_d1 = _adj(vid_adj); if (vid_d1 < nc) { if (!doing_bipartite) // note: compile-time branch (template @@ -1293,8 +1200,7 @@ class GraphColorDistance2 { if (vid_d1 != vid) { const color_type color = _colors(vid_d1); const color_type color_offset = color - offset; - if (color && - color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (color && color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // if it is in the current range, then add the color to the // banned colors if (color > offset) { @@ -1313,8 +1219,7 @@ class GraphColorDistance2 { const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); // Loop over distance-2 neighbors of vid - for (size_type vid_d1_adj = vid_d1_adj_begin; - !break_out && vid_d1_adj < vid_d1_adj_end; ++vid_d1_adj) { + for (size_type vid_d1_adj = vid_d1_adj_begin; !break_out && vid_d1_adj < vid_d1_adj_end; ++vid_d1_adj) { const lno_t vid_d2 = _t_adj(vid_d1_adj); // Ignore Distance-2 Self Loops @@ -1324,8 +1229,7 @@ class GraphColorDistance2 { // if color is within the current range, or if its color is in // a previously traversed range - if (offset <= color && - color_offset < VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (offset <= color && color_offset < VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // if it is in the current range, then add the color to the // banned colors forbidden |= (bit_64_forbidden_type(1) << color_offset); @@ -1343,9 +1247,8 @@ class GraphColorDistance2 { // check if an available color exists. if (~forbidden) { - bit_64_forbidden_type color_offset = - KokkosKernels::Impl::least_set_bit(~forbidden) - 1; - _colors(vid) = offset + color_offset; + bit_64_forbidden_type color_offset = KokkosKernels::Impl::least_set_bit(~forbidden) - 1; + _colors(vid) = offset + color_offset; return; } } // for offset <= (nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE) @@ -1366,16 +1269,9 @@ class GraphColorDistance2 { entries_t _t_adj; // transpose vertex adjacency list (NOT modified) color_view_type _colors; // vertex colors - functorGreedyColorVB_BIT_EF(lno_t nr_, lno_t nc_, rowmap_t xadj_, - lno_view_t adj_, rowmap_t t_xadj_, + functorGreedyColorVB_BIT_EF(lno_t nr_, lno_t nc_, rowmap_t xadj_, lno_view_t adj_, rowmap_t t_xadj_, entries_t t_adj_, color_view_type colors) - : _nr(nr_), - _nc(nc_), - _idx(xadj_), - _adj(adj_), - _t_idx(t_xadj_), - _t_adj(t_adj_), - _colors(colors) {} + : _nr(nr_), _nc(nc_), _idx(xadj_), _adj(adj_), _t_idx(t_xadj_), _t_adj(t_adj_), _colors(colors) {} // Color vertex i with smallest available color. // @@ -1394,8 +1290,7 @@ class GraphColorDistance2 { size_type vid_adj_end = _idx(vid + 1); bool foundColor = false; - for (color_type offset = 0; - !foundColor && offset <= (_nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); + for (color_type offset = 0; !foundColor && offset <= (_nr + VBBIT_D2_COLORING_FORBIDDEN_SIZE); offset += VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // Forbidden colors // - single long int for forbidden colors @@ -1406,8 +1301,7 @@ class GraphColorDistance2 { bool offset_colors_full = false; // Loop over distance-1 neighbors of vid - for (size_type vid_adj = vid_adj_begin; - !offset_colors_full && vid_adj < vid_adj_end; ++vid_adj) { + for (size_type vid_adj = vid_adj_begin; !offset_colors_full && vid_adj < vid_adj_end; ++vid_adj) { const lno_t vid_d1 = _adj(vid_adj); if (vid_d1 < _nc) { if (!doing_bipartite) // note: compile-time branch (template @@ -1419,21 +1313,20 @@ class GraphColorDistance2 { color_type color_offset = color - offset; // if color is within the current range, or if its color is in // a previously traversed range - if (color && offset < color && - color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (color && offset < color && color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { // if it is in the current range, then add the color to the // banned colors convert color to bit representation bit_64_forbidden_type ban_color_bit = 1; - ban_color_bit = ban_color_bit << (color_offset - 1); + ban_color_bit = ban_color_bit << (color_offset - 1); // add it to forbidden colors forbidden = forbidden | ban_color_bit; } } } - size_type vid_d1_adj_begin = _t_idx(vid_d1); - const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); - const size_type degree_vid_d1 = vid_d1_adj_end - vid_d1_adj_begin; + size_type vid_d1_adj_begin = _t_idx(vid_d1); + const size_type vid_d1_adj_end = _t_idx(vid_d1 + 1); + const size_type degree_vid_d1 = vid_d1_adj_end - vid_d1_adj_begin; size_type num_vid_d2_colored_in_range = 0; // Store the maximum color value found in the vertices adjacent to @@ -1441,26 +1334,22 @@ class GraphColorDistance2 { color_type max_color_adj_to_d1 = 0; // Loop over distance-2 neighbors of vid - for (size_type vid_d1_adj = vid_d1_adj_begin; - !offset_colors_full && vid_d1_adj < vid_d1_adj_end; + for (size_type vid_d1_adj = vid_d1_adj_begin; !offset_colors_full && vid_d1_adj < vid_d1_adj_end; ++vid_d1_adj) { const lno_t vid_d2 = _t_adj(vid_d1_adj); // Ignore Distance-2 Self Loops if (vid_d2 != vid && vid_d2 < _nr) { - color_type color = _colors(vid_d2); - color_type color_offset = - color - offset; // color_offset < 0 means color is from a - // previous offset. + color_type color = _colors(vid_d2); + color_type color_offset = color - offset; // color_offset < 0 means color is from a + // previous offset. // Update maximum color adjacent to vid_d1 found so far. - max_color_adj_to_d1 = - color > max_color_adj_to_d1 ? color : max_color_adj_to_d1; + max_color_adj_to_d1 = color > max_color_adj_to_d1 ? color : max_color_adj_to_d1; // if color is within the current range, or if its color is in // a previously traversed range - if (color && - color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { + if (color && color_offset <= VBBIT_D2_COLORING_FORBIDDEN_SIZE) { num_vid_d2_colored_in_range++; // if it is in the current range, then add the color to the @@ -1543,10 +1432,8 @@ class GraphColorDistance2 { lno_view_t _recolorList; single_lno_view_t _recolorListLength; - functorFindConflicts_Atomic(lno_t nr_, lno_t nc_, rowmap_t xadj_, - entries_t adj_, rowmap_t t_xadj_, - entries_t t_adj_, color_view_type colors, - lno_view_t vertexList, lno_view_t recolorList, + functorFindConflicts_Atomic(lno_t nr_, lno_t nc_, rowmap_t xadj_, entries_t adj_, rowmap_t t_xadj_, + entries_t t_adj_, color_view_type colors, lno_view_t vertexList, lno_view_t recolorList, single_lno_view_t recolorListLength) : nr(nr_), nc(nc_), @@ -1566,8 +1453,7 @@ class GraphColorDistance2 { const size_type vid_d1_adj_begin = _idx(vid); const size_type vid_d1_adj_end = _idx(vid + 1); // If vid is a valid column (vid < nc), check for column->vert conflicts - for (size_type vid_d1_adj = vid_d1_adj_begin; vid_d1_adj < vid_d1_adj_end; - vid_d1_adj++) { + for (size_type vid_d1_adj = vid_d1_adj_begin; vid_d1_adj < vid_d1_adj_end; vid_d1_adj++) { lno_t vid_d1 = _adj(vid_d1_adj); if (vid_d1 < nc) { if (!doing_bipartite) // note: compile-time branch (template param) @@ -1576,8 +1462,7 @@ class GraphColorDistance2 { if (vid_d1 != vid && _colors(vid_d1) == my_color) { _colors(vid) = 0; // uncolor vertex // Atomically add vertex to recolorList - const lno_t k = - Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); + const lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); _recolorList(k) = vid; numConflicts++; return; @@ -1585,16 +1470,14 @@ class GraphColorDistance2 { } const size_type d2_adj_begin = _t_idx(vid_d1); const size_type d2_adj_end = _t_idx(vid_d1 + 1); - for (size_type vid_d2_adj = d2_adj_begin; vid_d2_adj < d2_adj_end; - vid_d2_adj++) { + for (size_type vid_d2_adj = d2_adj_begin; vid_d2_adj < d2_adj_end; vid_d2_adj++) { const lno_t vid_d2 = _t_adj(vid_d2_adj); if (vid != vid_d2 && vid_d2 < nr) { if (_colors(vid_d2) == my_color) { _colors(vid) = 0; // uncolor vertex // Atomically add vertex to recolorList - const lno_t k = - Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); + const lno_t k = Kokkos::atomic_fetch_add(&_recolorListLength(), lno_t(1)); _recolorList(k) = vid; numConflicts++; return; @@ -1634,8 +1517,7 @@ class GraphColorDistance2 { * @return nothing */ template -void graph_print_distance2_color_histogram(KernelHandle* handle, - bool csv = false) { +void graph_print_distance2_color_histogram(KernelHandle* handle, bool csv = false) { using lno_view_t = typename KernelHandle::nnz_lno_temp_work_view_t; using lno_t = typename KernelHandle::nnz_lno_t; using execution_space = typename KernelHandle::HandleExecSpace; @@ -1647,11 +1529,8 @@ void graph_print_distance2_color_histogram(KernelHandle* handle, color_view_t colors = gch_d2->get_vertex_colors(); lno_t num_colors = gch_d2->get_num_colors(); lno_view_t histogram("histogram", num_colors + 1); - KokkosKernels::Impl::kk_get_histogram(colors.extent(0), - colors, histogram); - auto h_histogram = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), histogram); + KokkosKernels::Impl::kk_get_histogram(colors.extent(0), colors, histogram); + auto h_histogram = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), histogram); // note: both modes ignore color 0 in output, since we assume the coloring is // valid if (csv) { @@ -1661,8 +1540,7 @@ void graph_print_distance2_color_histogram(KernelHandle* handle, } std::cout << h_histogram(i); } else { - auto histogram_slice = Kokkos::subview( - histogram, std::make_pair((size_t)1, histogram.extent(0))); + auto histogram_slice = Kokkos::subview(histogram, std::make_pair((size_t)1, histogram.extent(0))); std::cout << "Distance-2 Color Histogram (1..N): " << std::endl; KokkosKernels::Impl::kk_print_1Dview(histogram_slice); std::cout << std::endl; diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index a359956a23..e39e1e7ad3 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -26,8 +26,7 @@ namespace KokkosGraph { namespace Impl { -template +template struct D2_MIS_RandomPriority { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -66,17 +65,14 @@ struct D2_MIS_RandomPriority { // adjacent to the column. // This counts up monotonically as vertices are eliminated (given status // OUT_SET) - rowStatus = status_view_t( - Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); - colStatus = status_view_t( - Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); + rowStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::ViewAllocateWithoutInitializing("ColStatus"), numVerts); allWorklists = Kokkos::View( Kokkos::ViewAllocateWithoutInitializing("AllWorklists"), numVerts, 3); } struct RefreshRowStatus { - RefreshRowStatus(const status_view_t& rowStatus_, - const worklist_t& worklist_, lno_t nvBits_, int round) + RefreshRowStatus(const status_view_t& rowStatus_, const worklist_t& worklist_, lno_t nvBits_, int round) : rowStatus(rowStatus_), worklist(worklist_), nvBits(nvBits_) { hashedRound = KokkosKernels::Impl::xorshiftHash(round); } @@ -85,8 +81,8 @@ struct D2_MIS_RandomPriority { lno_t i = worklist(w); // Combine vertex and round to get some pseudorandom priority bits that // change each round - status_t priority = KokkosKernels::Impl::xorshiftHash( - KokkosKernels::Impl::xorshiftHash(i) ^ hashedRound); + status_t priority = + KokkosKernels::Impl::xorshiftHash(KokkosKernels::Impl::xorshiftHash(i) ^ hashedRound); // Generate unique status per row, with IN_SET < status < OUT_SET, status_t newStatus = (status_t)(i + 1) | (priority << nvBits); if (newStatus == OUT_SET) newStatus--; @@ -100,10 +96,8 @@ struct D2_MIS_RandomPriority { }; struct RefreshColStatus { - RefreshColStatus(const status_view_t& colStatus_, - const worklist_t& worklist_, - const status_view_t& rowStatus_, const rowmap_t& rowmap_, - const entries_t& entries_, lno_t nv_, lno_t worklistLen_) + RefreshColStatus(const status_view_t& colStatus_, const worklist_t& worklist_, const status_view_t& rowStatus_, + const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_, lno_t worklistLen_) : colStatus(colStatus_), worklist(worklist_), rowStatus(rowStatus_), @@ -167,10 +161,8 @@ struct D2_MIS_RandomPriority { }; struct DecideSetFunctor { - DecideSetFunctor(const status_view_t& rowStatus_, - const status_view_t& colStatus_, const rowmap_t& rowmap_, - const entries_t& entries_, lno_t nv_, - const worklist_t& worklist_, lno_t worklistLen_) + DecideSetFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, + const entries_t& entries_, lno_t nv_, const worklist_t& worklist_, lno_t worklistLen_) : rowStatus(rowStatus_), colStatus(colStatus_), rowmap(rowmap_), @@ -275,8 +267,7 @@ struct D2_MIS_RandomPriority { struct CompactInSet { CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) : rowStatus(rowStatus_), setList(setList_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const { if (rowStatus(i) == IN_SET) { if (finalPass) setList(lNumInSet) = i; lNumInSet++; @@ -287,11 +278,9 @@ struct D2_MIS_RandomPriority { }; struct MaskedWorklist { - MaskedWorklist(const lno_view_t& mask_, const worklist_t& worklist_) - : mask(mask_), worklist(worklist_) {} + MaskedWorklist(const lno_view_t& mask_, const worklist_t& worklist_) : mask(mask_), worklist(worklist_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInList, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInList, bool finalPass) const { if (mask(i) < 0) { if (finalPass) worklist(lNumInList) = i; lNumInList++; @@ -302,12 +291,10 @@ struct D2_MIS_RandomPriority { }; struct CompactWorklistFunctor { - CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, - const status_view_t& status_) + CompactWorklistFunctor(const worklist_t& src_, const worklist_t& dst_, const status_view_t& status_) : src(src_), dst(dst_), status(status_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lNumInSet, bool finalPass) const { lno_t i = src(w); status_t s = status(i); if (s != IN_SET && s != OUT_SET) { @@ -329,15 +316,12 @@ struct D2_MIS_RandomPriority { KokkosKernels::Impl::sequential_fill(rowWorklist); KokkosKernels::Impl::sequential_fill(colWorklist); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && - (entries.extent(0) / numVerts >= 16); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - numVerts, entries.extent(0), execSpaceEnum); - int round = 0; - lno_t rowWorkLen = numVerts; - lno_t colWorkLen = numVerts; + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); + int round = 0; + lno_t rowWorkLen = numVerts; + lno_t colWorkLen = numVerts; int refreshColTeamSize = 0; int decideSetTeamSize = 0; if (useTeams) { @@ -345,71 +329,54 @@ struct D2_MIS_RandomPriority { // Compute the recommended team size for RefreshColStatus and // DecideSetFunctor (will be constant) { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); - refreshColTeamSize = - dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + refreshColTeamSize = dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); } { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); - decideSetTeamSize = - dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + decideSetTeamSize = dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); } } while (true) { // Compute new row statuses - Kokkos::parallel_for( - range_pol(0, rowWorkLen), - RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); // Compute new col statuses { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); if (useTeams) - Kokkos::parallel_for(team_pol((colWorkLen + refreshColTeamSize - 1) / - refreshColTeamSize, - refreshColTeamSize, vectorLength), - refreshCol); + Kokkos::parallel_for( + team_pol((colWorkLen + refreshColTeamSize - 1) / refreshColTeamSize, refreshColTeamSize, vectorLength), + refreshCol); else Kokkos::parallel_for(range_pol(0, colWorkLen), refreshCol); } // Decide row statuses where enough information is available { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); if (useTeams) Kokkos::parallel_for( - team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, - decideSetTeamSize, vectorLength), + team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, decideSetTeamSize, vectorLength), decideSet); else Kokkos::parallel_for(range_pol(0, rowWorkLen), decideSet); } round++; // Compact row worklist - Kokkos::parallel_scan( - range_pol(0, rowWorkLen), - CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), - rowWorkLen); + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), + rowWorkLen); if (rowWorkLen == 0) break; std::swap(rowWorklist, thirdWorklist); // Compact col worklist - Kokkos::parallel_scan( - range_pol(0, colWorkLen), - CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), - colWorkLen); + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), + colWorkLen); std::swap(colWorklist, thirdWorklist); } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. lno_t numInSet = 0; - Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), - numInSet); - lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), - numInSet); - Kokkos::parallel_scan(range_pol(0, numVerts), - CompactInSet(rowStatus, setList)); + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); return setList; } @@ -422,20 +389,16 @@ struct D2_MIS_RandomPriority { lno_t rowWorkLen = numVerts; lno_t colWorkLen = numVerts; // Row worklist: initially only the non-masked vertices - Kokkos::parallel_scan(range_pol(0, numVerts), - MaskedWorklist(mask, rowWorklist), rowWorkLen); + Kokkos::parallel_scan(range_pol(0, numVerts), MaskedWorklist(mask, rowWorklist), rowWorkLen); KokkosKernels::Impl::sequential_fill(colWorklist); // Need to fill rowStatus with OUT_SET initially so that vertices not in the // worklist don't affect algorithm Kokkos::deep_copy(rowStatus, ~(status_t(0))); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && - (entries.extent(0) / numVerts >= 16); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - numVerts, entries.extent(0), execSpaceEnum); - int round = 0; + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); + int round = 0; int refreshColTeamSize = 0; int decideSetTeamSize = 0; if (useTeams) { @@ -443,71 +406,54 @@ struct D2_MIS_RandomPriority { // Compute the recommended team size for RefreshColStatus and // DecideSetFunctor (will be constant) { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); - refreshColTeamSize = - dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); + refreshColTeamSize = dummyPolicy.team_size_max(refreshCol, Kokkos::ParallelForTag()); } { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); - decideSetTeamSize = - dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); + decideSetTeamSize = dummyPolicy.team_size_max(decideSet, Kokkos::ParallelForTag()); } } while (true) { // Compute new row statuses - Kokkos::parallel_for( - range_pol(0, rowWorkLen), - RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); + Kokkos::parallel_for(range_pol(0, rowWorkLen), RefreshRowStatus(rowStatus, rowWorklist, nvBits, round)); // Compute new col statuses { - RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, - entries, numVerts, colWorkLen); + RefreshColStatus refreshCol(colStatus, colWorklist, rowStatus, rowmap, entries, numVerts, colWorkLen); if (useTeams) - Kokkos::parallel_for(team_pol((colWorkLen + refreshColTeamSize - 1) / - refreshColTeamSize, - refreshColTeamSize, vectorLength), - refreshCol); + Kokkos::parallel_for( + team_pol((colWorkLen + refreshColTeamSize - 1) / refreshColTeamSize, refreshColTeamSize, vectorLength), + refreshCol); else Kokkos::parallel_for(range_pol(0, colWorkLen), refreshCol); } // Decide row statuses where enough information is available { - DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, - numVerts, rowWorklist, rowWorkLen); + DecideSetFunctor decideSet(rowStatus, colStatus, rowmap, entries, numVerts, rowWorklist, rowWorkLen); if (useTeams) Kokkos::parallel_for( - team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, - decideSetTeamSize, vectorLength), + team_pol((rowWorkLen + decideSetTeamSize - 1) / decideSetTeamSize, decideSetTeamSize, vectorLength), decideSet); else Kokkos::parallel_for(range_pol(0, rowWorkLen), decideSet); } round++; // Compact row worklist - Kokkos::parallel_scan( - range_pol(0, rowWorkLen), - CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), - rowWorkLen); + Kokkos::parallel_scan(range_pol(0, rowWorkLen), CompactWorklistFunctor(rowWorklist, thirdWorklist, rowStatus), + rowWorkLen); if (rowWorkLen == 0) break; std::swap(rowWorklist, thirdWorklist); // Compact col worklist - Kokkos::parallel_scan( - range_pol(0, colWorkLen), - CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), - colWorkLen); + Kokkos::parallel_scan(range_pol(0, colWorkLen), CompactWorklistFunctor(colWorklist, thirdWorklist, colStatus), + colWorkLen); std::swap(colWorklist, thirdWorklist); } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. lno_t numInSet = 0; - Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), - numInSet); - lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), - numInSet); - Kokkos::parallel_scan(range_pol(0, numVerts), - CompactInSet(rowStatus, setList)); + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::ViewAllocateWithoutInitializing("D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); return setList; } @@ -523,8 +469,7 @@ struct D2_MIS_RandomPriority { int nvBits; }; -template +template struct D2_MIS_FixedPriority { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -551,10 +496,8 @@ struct D2_MIS_FixedPriority { entries(entries_), numVerts(rowmap.extent(0) - 1), colUpdateBitset(numVerts), - worklist1(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL1"), - numVerts), - worklist2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL2"), - numVerts) { + worklist1(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL1"), numVerts), + worklist2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "WL2"), numVerts) { status_t i = numVerts + 1; nvBits = 0; while (i) { @@ -566,25 +509,19 @@ struct D2_MIS_FixedPriority { // adjacent to the column. // This counts up monotonically as vertices are eliminated (given status // OUT_SET) - rowStatus = status_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "RowStatus"), numVerts); - colStatus = status_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "ColStatus"), numVerts); - KokkosSparse::Impl::graph_min_max_degree( - rowmap, minDegree, maxDegree); + rowStatus = status_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "RowStatus"), numVerts); + colStatus = status_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "ColStatus"), numVerts); + KokkosSparse::Impl::graph_min_max_degree(rowmap, minDegree, maxDegree); // Compute row statuses Kokkos::parallel_for(range_pol(0, numVerts), - InitRowStatus(rowStatus, rowmap, numVerts, nvBits, - minDegree, maxDegree)); + InitRowStatus(rowStatus, rowmap, numVerts, nvBits, minDegree, maxDegree)); // Compute col statuses - Kokkos::parallel_for( - range_pol(0, numVerts), - InitColStatus(colStatus, rowStatus, rowmap, entries, numVerts)); + Kokkos::parallel_for(range_pol(0, numVerts), InitColStatus(colStatus, rowStatus, rowmap, entries, numVerts)); } struct InitRowStatus { - InitRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, - lno_t nv_, lno_t nvBits_, lno_t minDeg_, lno_t maxDeg_) + InitRowStatus(const status_view_t& rowStatus_, const rowmap_t& rowmap_, lno_t nv_, lno_t nvBits_, lno_t minDeg_, + lno_t maxDeg_) : rowStatus(rowStatus_), rowmap(rowmap_), nv(nv_), @@ -605,8 +542,7 @@ struct D2_MIS_FixedPriority { status_t maxDegRange = (((status_t)1) << degBits) - 2; lno_t deg = rowmap(i + 1) - rowmap(i); float degScore = (float)(deg - minDeg) * invDegRange; - rowStatus(i) = - (status_t)(i + 1) + (((status_t)(degScore * maxDegRange)) << nvBits); + rowStatus(i) = (status_t)(i + 1) + (((status_t)(degScore * maxDegRange)) << nvBits); } status_view_t rowStatus; @@ -619,14 +555,9 @@ struct D2_MIS_FixedPriority { }; struct InitColStatus { - InitColStatus(const status_view_t& colStatus_, - const status_view_t& rowStatus_, const rowmap_t& rowmap_, + InitColStatus(const status_view_t& colStatus_, const status_view_t& rowStatus_, const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) - : colStatus(colStatus_), - rowStatus(rowStatus_), - rowmap(rowmap_), - entries(entries_), - nv(nv_) {} + : colStatus(colStatus_), rowStatus(rowStatus_), rowmap(rowmap_), entries(entries_), nv(nv_) {} KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const { // iterate over {i} union the neighbors of i, to find @@ -652,10 +583,8 @@ struct D2_MIS_FixedPriority { }; struct IterateStatusFunctor { - IterateStatusFunctor(const status_view_t& rowStatus_, - const status_view_t& colStatus_, - const rowmap_t& rowmap_, const entries_t& entries_, - lno_t nv_, const lno_view_t& worklist_, + IterateStatusFunctor(const status_view_t& rowStatus_, const status_view_t& colStatus_, const rowmap_t& rowmap_, + const entries_t& entries_, lno_t nv_, const lno_view_t& worklist_, const bitset_t& colUpdateBitset_) : rowStatus(rowStatus_), colStatus(colStatus_), @@ -715,15 +644,11 @@ struct D2_MIS_FixedPriority { }; struct UpdateWorklistFunctor { - UpdateWorklistFunctor(const status_view_t& rowStatus_, - const lno_view_t& oldWorklist_, + UpdateWorklistFunctor(const status_view_t& rowStatus_, const lno_view_t& oldWorklist_, const lno_view_t& newWorklist_) - : rowStatus(rowStatus_), - oldWorklist(oldWorklist_), - newWorklist(newWorklist_) {} + : rowStatus(rowStatus_), oldWorklist(oldWorklist_), newWorklist(newWorklist_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lcount, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t w, lno_t& lcount, bool finalPass) const { // processing row i lno_t i = oldWorklist(w); // Bit i will be set when it's decided IN_SET/OUT_SET. @@ -741,12 +666,10 @@ struct D2_MIS_FixedPriority { }; struct ColRefreshWorklist { - ColRefreshWorklist(const bitset_t& colUpdateBitset_, - const lno_view_t& refreshList_) + ColRefreshWorklist(const bitset_t& colUpdateBitset_, const lno_view_t& refreshList_) : colUpdateBitset(colUpdateBitset_), refreshList(refreshList_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lindex, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lindex, bool finalPass) const { if (colUpdateBitset.test(i)) { if (finalPass) { refreshList(lindex) = i; @@ -761,10 +684,8 @@ struct D2_MIS_FixedPriority { }; struct RefreshColStatus { - RefreshColStatus(const lno_view_t& worklist_, - const status_view_t& rowStatus_, - const status_view_t& colStatus_, const rowmap_t& rowmap_, - const entries_t& entries_, lno_t nv_) + RefreshColStatus(const lno_view_t& worklist_, const status_view_t& rowStatus_, const status_view_t& colStatus_, + const rowmap_t& rowmap_, const entries_t& entries_, lno_t nv_) : worklist(worklist_), rowStatus(rowStatus_), colStatus(colStatus_), @@ -812,8 +733,7 @@ struct D2_MIS_FixedPriority { struct CompactInSet { CompactInSet(const status_view_t& rowStatus_, const lno_view_t& setList_) : rowStatus(rowStatus_), setList(setList_) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lNumInSet, bool finalPass) const { if (rowStatus(i) == IN_SET) { if (finalPass) setList(lNumInSet) = i; lNumInSet++; @@ -825,30 +745,22 @@ struct D2_MIS_FixedPriority { lno_view_t compute() { // Initialize first worklist to 0...numVerts - Kokkos::parallel_for(range_pol(0, numVerts), - InitWorklistFunctor(worklist1)); + Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; while (workRemain) { // do another iteration - Kokkos::parallel_for( - range_pol(0, workRemain), - IterateStatusFunctor(rowStatus, colStatus, rowmap, entries, numVerts, - worklist1, colUpdateBitset)); + Kokkos::parallel_for(range_pol(0, workRemain), IterateStatusFunctor(rowStatus, colStatus, rowmap, entries, + numVerts, worklist1, colUpdateBitset)); // And refresh the column statuses using the other worklist. lno_t colsToRefresh; - Kokkos::parallel_scan(range_pol(0, numVerts), - ColRefreshWorklist(colUpdateBitset, worklist2), - colsToRefresh); + Kokkos::parallel_scan(range_pol(0, numVerts), ColRefreshWorklist(colUpdateBitset, worklist2), colsToRefresh); Kokkos::parallel_for(range_pol(0, colsToRefresh), - RefreshColStatus(worklist2, rowStatus, colStatus, - rowmap, entries, numVerts)); + RefreshColStatus(worklist2, rowStatus, colStatus, rowmap, entries, numVerts)); // then build the next worklist with a scan. Also get the length of the // next worklist. lno_t newWorkRemain = 0; - Kokkos::parallel_scan( - range_pol(0, workRemain), - UpdateWorklistFunctor(rowStatus, worklist1, worklist2), - newWorkRemain); + Kokkos::parallel_scan(range_pol(0, workRemain), UpdateWorklistFunctor(rowStatus, worklist1, worklist2), + newWorkRemain); // Finally, flip the worklists std::swap(worklist1, worklist2); workRemain = newWorkRemain; @@ -856,12 +768,9 @@ struct D2_MIS_FixedPriority { // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. lno_t numInSet = 0; - Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), - numInSet); - lno_view_t setList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "D2MIS"), - numInSet); - Kokkos::parallel_scan(range_pol(0, numVerts), - CompactInSet(rowStatus, setList)); + Kokkos::parallel_reduce(range_pol(0, numVerts), CountInSet(rowStatus), numInSet); + lno_view_t setList(Kokkos::view_alloc(Kokkos::WithoutInitializing, "D2MIS"), numInSet); + Kokkos::parallel_scan(range_pol(0, numVerts), CompactInSet(rowStatus, setList)); return setList; } @@ -883,8 +792,7 @@ struct D2_MIS_FixedPriority { lno_view_t worklist2; }; -template +template struct D2_MIS_Aggregation { using exec_space = typename device_t::execution_space; using mem_space = typename device_t::memory_space; @@ -904,15 +812,13 @@ struct D2_MIS_Aggregation { : rowmap(rowmap_), entries(entries_), numVerts(rowmap.extent(0) - 1), - labels(Kokkos::ViewAllocateWithoutInitializing("AggregateLabels"), - numVerts), + labels(Kokkos::ViewAllocateWithoutInitializing("AggregateLabels"), numVerts), roots("Root Status", numVerts) { Kokkos::deep_copy(labels, (lno_t)-1); } struct Phase1Functor { - Phase1Functor(lno_t numVerts__, const mis2_view& m1__, - const rowmap_t& rowmap__, const entries_t& entries__, + Phase1Functor(lno_t numVerts__, const mis2_view& m1__, const rowmap_t& rowmap__, const entries_t& entries__, const labels_t& labels__, const char_view_t& roots__) : numVerts_(numVerts__), m1_(m1__), @@ -943,21 +849,16 @@ struct D2_MIS_Aggregation { void createPrimaryAggregates() { // Compute an MIS-2 - D2_MIS_RandomPriority d2mis( - rowmap, entries); + D2_MIS_RandomPriority d2mis(rowmap, entries); mis2_view m1 = d2mis.compute(); // Construct initial aggregates using roots and all direct neighbors - Kokkos::parallel_for( - range_pol(0, m1.extent(0)), - Phase1Functor(numVerts, m1, rowmap, entries, labels, roots)); + Kokkos::parallel_for(range_pol(0, m1.extent(0)), Phase1Functor(numVerts, m1, rowmap, entries, labels, roots)); numAggs = m1.extent(0); } struct CandAggSizesFunctor { - CandAggSizesFunctor(lno_t numVerts__, const labels_t& m2__, - const rowmap_t& rowmap__, const entries_t& entries__, - const labels_t& labels__, - const labels_t& candAggSizes__) + CandAggSizesFunctor(lno_t numVerts__, const labels_t& m2__, const rowmap_t& rowmap__, const entries_t& entries__, + const labels_t& labels__, const labels_t& candAggSizes__) : numVerts_(numVerts__), m2_(m2__), rowmap_(rowmap__), @@ -988,11 +889,8 @@ struct D2_MIS_Aggregation { }; struct ChoosePhase2AggsFunctor { - ChoosePhase2AggsFunctor(lno_t numVerts__, lno_t numAggs__, - const labels_t& m2__, const rowmap_t& rowmap__, - const entries_t& entries__, - const labels_t& labels__, - const labels_t& candAggSizes__, + ChoosePhase2AggsFunctor(lno_t numVerts__, lno_t numAggs__, const labels_t& m2__, const rowmap_t& rowmap__, + const entries_t& entries__, const labels_t& labels__, const labels_t& candAggSizes__, const char_view_t& roots__) : numVerts_(numVerts__), numAggs_(numAggs__), @@ -1003,8 +901,7 @@ struct D2_MIS_Aggregation { candAggSizes_(candAggSizes__), roots_(roots__) {} - KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lid, - bool finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(lno_t i, lno_t& lid, bool finalPass) const { lno_t aggSize = candAggSizes_(i); if (aggSize < 3) return; if (finalPass) { @@ -1035,36 +932,27 @@ struct D2_MIS_Aggregation { }; void createSecondaryAggregates() { - labels_t candAggSizes( - Kokkos::ViewAllocateWithoutInitializing("Phase2 Candidate Agg Sizes"), - numVerts); + labels_t candAggSizes(Kokkos::ViewAllocateWithoutInitializing("Phase2 Candidate Agg Sizes"), numVerts); // Compute a new MIS-2 from only unaggregated nodes - D2_MIS_RandomPriority d2mis( - rowmap, entries); + D2_MIS_RandomPriority d2mis(rowmap, entries); labels_t m2 = d2mis.compute(labels); lno_t numCandRoots = m2.extent(0); // Compute the sizes of would-be aggregates. Kokkos::parallel_for(range_pol(0, numCandRoots), - CandAggSizesFunctor(numVerts, m2, rowmap, entries, - labels, candAggSizes)); + CandAggSizesFunctor(numVerts, m2, rowmap, entries, labels, candAggSizes)); // Now, filter out the candidate aggs which are big enough, and create those // aggregates. Using a scan for this assigns IDs deterministically (unlike // an atomic counter). lno_t numNewAggs = 0; - Kokkos::parallel_scan( - range_pol(0, numCandRoots), - ChoosePhase2AggsFunctor(numVerts, numAggs, m2, rowmap, entries, labels, - candAggSizes, roots), - numNewAggs); + Kokkos::parallel_scan(range_pol(0, numCandRoots), + ChoosePhase2AggsFunctor(numVerts, numAggs, m2, rowmap, entries, labels, candAggSizes, roots), + numNewAggs); numAggs += numNewAggs; } struct SizeAndConnectivityFunctor { - SizeAndConnectivityFunctor(lno_t numVerts__, const rowmap_t& rowmap__, - const entries_t& entries__, - const labels_t& labels__, - const labels_t& connectivities__, - const labels_t& aggSizes__) + SizeAndConnectivityFunctor(lno_t numVerts__, const rowmap_t& rowmap__, const entries_t& entries__, + const labels_t& labels__, const labels_t& connectivities__, const labels_t& aggSizes__) : numVerts_(numVerts__), rowmap_(rowmap__), entries_(entries__), @@ -1100,12 +988,9 @@ struct D2_MIS_Aggregation { }; struct AssignLeftoverFunctor { - AssignLeftoverFunctor(lno_t numVerts__, const rowmap_t& rowmap__, - const entries_t& entries__, const labels_t& labels__, - const labels_t& labelsOld__, - const labels_t& connectivities__, - const labels_t& aggSizes__, - const char_view_t& roots__) + AssignLeftoverFunctor(lno_t numVerts__, const rowmap_t& rowmap__, const entries_t& entries__, + const labels_t& labels__, const labels_t& labelsOld__, const labels_t& connectivities__, + const labels_t& aggSizes__, const char_view_t& roots__) : numVerts_(numVerts__), rowmap_(rowmap__), entries_(entries__), @@ -1167,8 +1052,7 @@ struct D2_MIS_Aggregation { // Priorities: adjacent to root > connect > size if (trackedRootAdj[k] > bestRootAdj || (trackedRootAdj[k] == bestRootAdj && - ((trackedConnect[k] > bestConnect) || - (trackedConnect[k] == bestConnect && s < bestSize)))) { + ((trackedConnect[k] > bestConnect) || (trackedConnect[k] == bestConnect && s < bestSize)))) { bestRootAdj = trackedRootAdj[k]; bestConnect = trackedConnect[k]; bestSize = s; @@ -1195,18 +1079,13 @@ struct D2_MIS_Aggregation { // neighboring aggregate. labels_t labelsOld("old", numVerts); Kokkos::deep_copy(labelsOld, labels); - labels_t connectivities(Kokkos::ViewAllocateWithoutInitializing("connect"), - numVerts); + labels_t connectivities(Kokkos::ViewAllocateWithoutInitializing("connect"), numVerts); labels_t aggSizes("Phase3 Agg Sizes", numAggs); - Kokkos::parallel_for( - range_pol(0, numVerts), - SizeAndConnectivityFunctor(numVerts, rowmap, entries, labels, - connectivities, aggSizes)); + Kokkos::parallel_for(range_pol(0, numVerts), + SizeAndConnectivityFunctor(numVerts, rowmap, entries, labels, connectivities, aggSizes)); // Now, join vertices to aggregates - Kokkos::parallel_for( - range_pol(0, numVerts), - AssignLeftoverFunctor(numVerts, rowmap, entries, labels, labelsOld, - connectivities, aggSizes, roots)); + Kokkos::parallel_for(range_pol(0, numVerts), AssignLeftoverFunctor(numVerts, rowmap, entries, labels, labelsOld, + connectivities, aggSizes, roots)); } // phase 2 creates new aggregates in between the initial MIS-2 neighborhoods. diff --git a/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp index 464880c932..dc0e802485 100644 --- a/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp +++ b/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -20,10 +20,8 @@ namespace KokkosGraph { namespace Impl { -template +template struct ExplicitGraphCoarsening { using exec_space = typename device_t::execution_space; using range_pol = Kokkos::RangePolicy; @@ -33,29 +31,23 @@ struct ExplicitGraphCoarsening { using const_bitset_t = Kokkos::ConstBitset; struct ClusterSizeFunctor { - ClusterSizeFunctor(const ordinal_view_t& counts_, - const labels_t& vertClusters_) + ClusterSizeFunctor(const ordinal_view_t& counts_, const labels_t& vertClusters_) : counts(counts_), vertClusters(vertClusters_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - Kokkos::atomic_increment(&counts(vertClusters(i))); - } + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { Kokkos::atomic_increment(&counts(vertClusters(i))); } ordinal_view_t counts; labels_t vertClusters; }; struct FillClusterVertsFunctor { - FillClusterVertsFunctor(const ordinal_view_t& clusterOffsets_, - const ordinal_view_t& clusterVerts_, - const labels_t& vertClusters_, - const ordinal_view_t& insertCounts_) + FillClusterVertsFunctor(const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, + const labels_t& vertClusters_, const ordinal_view_t& insertCounts_) : clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), insertCounts(insertCounts_) {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - lno_t cluster = vertClusters(i); - lno_t offset = clusterOffsets(cluster) + - Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); + lno_t cluster = vertClusters(i); + lno_t offset = clusterOffsets(cluster) + Kokkos::atomic_fetch_add(&insertCounts(cluster), 1); clusterVerts(offset) = i; } ordinal_view_t clusterOffsets; @@ -65,12 +57,9 @@ struct ExplicitGraphCoarsening { }; struct BuildCrossClusterMaskFunctor { - BuildCrossClusterMaskFunctor(const fine_rowmap_t& rowmap_, - const fine_entries_t& colinds_, - const ordinal_view_t& clusterOffsets_, - const ordinal_view_t& clusterVerts_, - const labels_t& vertClusters_, - const bitset_t& mask_) + BuildCrossClusterMaskFunctor(const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, + const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, + const labels_t& vertClusters_, const bitset_t& mask_) : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), @@ -106,13 +95,10 @@ struct ExplicitGraphCoarsening { // Try to insert the edge between cluster (team's cluster) and neighbor // (neighboring cluster) by inserting nei into the table. - KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, - int* table) const { + KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, int* table) const { unsigned h = xorshiftHash(nei); for (unsigned i = h; i < h + 2; i++) { - if (Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], - cluster, nei)) - return true; + if (Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) return true; } return false; } @@ -127,40 +113,35 @@ struct ExplicitGraphCoarsening { // thread handles a cluster int* table = (int*)t.team_shmem().get_shmem(tableSize() * sizeof(int)); // mark every entry as cluster (self-loop) to represent free/empty - Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), - [&](const lno_t i) { table[i] = cluster; }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(t, tableSize()), [&](const lno_t i) { table[i] = cluster; }); t.team_barrier(); // now, for each row belonging to the cluster, iterate through the // neighbors - Kokkos::parallel_for( - Kokkos::TeamThreadRange(t, clusterSize), [&](const lno_t i) { - lno_t row = clusterVerts(clusterOffsets(cluster) + i); - lno_t rowDeg = rowmap(row + 1) - rowmap(row); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), - [&](const lno_t j) { - lno_t nei = colinds(rowmap(row) + j); - // Remote neighbors are not included - if (nei >= numRows) return; - lno_t neiCluster = vertClusters(nei); - if (neiCluster != cluster) { - // Have a neighbor. Try to find it in the - // table. - if (!lookup(neiCluster, table)) { - // Not in the table. Try to insert it. - insert(cluster, neiCluster, table); - // Whether or not insertion succeeded, - // this is a cross-cluster edge possibly - // not seen before - mask.set(rowmap(row) + j); - } - } - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clusterSize), [&](const lno_t i) { + lno_t row = clusterVerts(clusterOffsets(cluster) + i); + lno_t rowDeg = rowmap(row + 1) - rowmap(row); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, rowDeg), [&](const lno_t j) { + lno_t nei = colinds(rowmap(row) + j); + // Remote neighbors are not included + if (nei >= numRows) return; + lno_t neiCluster = vertClusters(nei); + if (neiCluster != cluster) { + // Have a neighbor. Try to find it in the + // table. + if (!lookup(neiCluster, table)) { + // Not in the table. Try to insert it. + insert(cluster, neiCluster, table); + // Whether or not insertion succeeded, + // this is a cross-cluster edge possibly + // not seen before + mask.set(rowmap(row) + j); + } + } + }); + }); } - size_t team_shmem_size(int /*teamSize*/) const { - return tableSize() * sizeof(int); - } + size_t team_shmem_size(int /*teamSize*/) const { return tableSize() * sizeof(int); } lno_t numRows; fine_rowmap_t rowmap; @@ -172,14 +153,10 @@ struct ExplicitGraphCoarsening { }; struct FillClusterEntriesFunctor { - FillClusterEntriesFunctor(const fine_rowmap_t& rowmap_, - const fine_entries_t& colinds_, - const coarse_rowmap_t& clusterRowmap_, - const coarse_entries_t& clusterEntries_, - const ordinal_view_t& clusterOffsets_, - const ordinal_view_t& clusterVerts_, - const labels_t& vertClusters_, - const bitset_t& edgeMask_) + FillClusterEntriesFunctor(const fine_rowmap_t& rowmap_, const fine_entries_t& colinds_, + const coarse_rowmap_t& clusterRowmap_, const coarse_entries_t& clusterEntries_, + const ordinal_view_t& clusterOffsets_, const ordinal_view_t& clusterVerts_, + const labels_t& vertClusters_, const bitset_t& edgeMask_) : rowmap(rowmap_), colinds(colinds_), clusterRowmap(clusterRowmap_), @@ -189,8 +166,7 @@ struct ExplicitGraphCoarsening { vertClusters(vertClusters_), edgeMask(edgeMask_) {} // Run this scan over entries in clusterVerts (reordered point rows) - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, lno_t& lcount, - const bool& finalPass) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, lno_t& lcount, const bool& finalPass) const { lno_t numRows = rowmap.extent(0) - 1; lno_t row = clusterVerts(i); size_type rowStart = rowmap(row); @@ -238,9 +214,8 @@ struct ExplicitGraphCoarsening { // Constructor just does the computation and outputs to coarseRowmap, // coarseEntries. - ExplicitGraphCoarsening(const fine_rowmap_t& fineRowmap, - const fine_entries_t& fineEntries, - const labels_t& labels, lno_t numCoarseVerts) { + ExplicitGraphCoarsening(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, const labels_t& labels, + lno_t numCoarseVerts) { lno_t numFineVerts = fineRowmap.extent(0); if (numFineVerts <= 1) { coarseRowmap = coarse_rowmap_t(); @@ -249,54 +224,39 @@ struct ExplicitGraphCoarsening { } numFineVerts--; clusterOffsets = ordinal_view_t("Cluster offsets", numCoarseVerts + 1); - clusterVerts = ordinal_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster verts"), - numFineVerts); - Kokkos::parallel_for(range_pol(0, numFineVerts), - ClusterSizeFunctor(clusterOffsets, labels)); - KokkosKernels::Impl::exclusive_parallel_prefix_sum( - numCoarseVerts + 1, clusterOffsets); + clusterVerts = ordinal_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster verts"), numFineVerts); + Kokkos::parallel_for(range_pol(0, numFineVerts), ClusterSizeFunctor(clusterOffsets, labels)); + KokkosKernels::Impl::exclusive_parallel_prefix_sum(numCoarseVerts + 1, clusterOffsets); { - ordinal_view_t tempInsertCounts("Temporary cluster insert counts", - numCoarseVerts); + ordinal_view_t tempInsertCounts("Temporary cluster insert counts", numCoarseVerts); Kokkos::parallel_for(range_pol(0, numFineVerts), - FillClusterVertsFunctor(clusterOffsets, clusterVerts, - labels, tempInsertCounts)); + FillClusterVertsFunctor(clusterOffsets, clusterVerts, labels, tempInsertCounts)); } // Determine the set of edges (in the point graph) that cross between two // distinct clusters int vectorSize = KokkosKernels::Impl::kk_get_suggested_vector_size( - numFineVerts, fineEntries.extent(0), - KokkosKernels::Impl::kk_get_exec_space_type()); + numFineVerts, fineEntries.extent(0), KokkosKernels::Impl::kk_get_exec_space_type()); bitset_t crossClusterEdgeMask(fineEntries.extent(0)); size_type numClusterEdges; { - BuildCrossClusterMaskFunctor buildEdgeMask(fineRowmap, fineEntries, - clusterOffsets, clusterVerts, - labels, crossClusterEdgeMask); - int sharedPerTeam = buildEdgeMask.team_shmem_size( - 0); // using team-size = 0 for since no per-thread shared is used. - int teamSize = KokkosKernels::Impl::get_suggested_team_size( - buildEdgeMask, vectorSize, sharedPerTeam, 0); + BuildCrossClusterMaskFunctor buildEdgeMask(fineRowmap, fineEntries, clusterOffsets, clusterVerts, labels, + crossClusterEdgeMask); + int sharedPerTeam = + buildEdgeMask.team_shmem_size(0); // using team-size = 0 for since no per-thread shared is used. + int teamSize = + KokkosKernels::Impl::get_suggested_team_size(buildEdgeMask, vectorSize, sharedPerTeam, 0); Kokkos::parallel_for( - team_pol(numCoarseVerts, teamSize, vectorSize) - .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), + team_pol(numCoarseVerts, teamSize, vectorSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)), buildEdgeMask); numClusterEdges = crossClusterEdgeMask.count(); } - coarseRowmap = coarse_rowmap_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster graph rowmap"), - numCoarseVerts + 1); + coarseRowmap = + coarse_rowmap_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster graph rowmap"), numCoarseVerts + 1); coarseEntries = - coarse_entries_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, - "Cluster graph colinds"), - numClusterEdges); - Kokkos::parallel_scan( - range_pol(0, numFineVerts), - FillClusterEntriesFunctor(fineRowmap, fineEntries, coarseRowmap, - coarseEntries, clusterOffsets, clusterVerts, - labels, crossClusterEdgeMask)); + coarse_entries_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Cluster graph colinds"), numClusterEdges); + Kokkos::parallel_scan(range_pol(0, numFineVerts), + FillClusterEntriesFunctor(fineRowmap, fineEntries, coarseRowmap, coarseEntries, + clusterOffsets, clusterVerts, labels, crossClusterEdgeMask)); } coarse_rowmap_t coarseRowmap; diff --git a/graph/impl/KokkosGraph_color_d1_spec.hpp b/graph/impl/KokkosGraph_color_d1_spec.hpp index 5d66240763..178fdd9182 100644 --- a/graph/impl/KokkosGraph_color_d1_spec.hpp +++ b/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -36,21 +36,17 @@ struct color_d1_eti_spec_avail { } // namespace Impl } // namespace KokkosGraph -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct color_d1_eti_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct color_d1_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -63,24 +59,19 @@ namespace Impl { /// \brief Implementation of KokkosGraph::graph_color (distance-1 greedy /// coloring) -template ::value> +template ::value> struct COLOR_D1 { - static void color_d1(KernelHandle *handle, - typename lno_view_t::non_const_value_type num_rows, - size_view_t rowmap, lno_view_t entries); + static void color_d1(KernelHandle *handle, typename lno_view_t::non_const_value_type num_rows, size_view_t rowmap, + lno_view_t entries); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct COLOR_D1 { - static void color_d1(KernelHandle *handle, - typename lno_view_t::non_const_value_type num_rows, - size_view_t rowmap, lno_view_t entries) { +struct COLOR_D1 { + static void color_d1(KernelHandle *handle, typename lno_view_t::non_const_value_type num_rows, size_view_t rowmap, + lno_view_t entries) { KokkosGraph::Impl::graph_color_impl(handle, num_rows, rowmap, entries); } }; @@ -90,34 +81,26 @@ struct COLOR_D1, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct COLOR_D1< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct COLOR_D1< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct COLOR_D1< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #endif diff --git a/graph/src/KokkosGraph_CoarsenConstruct.hpp b/graph/src/KokkosGraph_CoarsenConstruct.hpp index 28de59979e..8e1cce3ddb 100644 --- a/graph/src/KokkosGraph_CoarsenConstruct.hpp +++ b/graph/src/KokkosGraph_CoarsenConstruct.hpp @@ -31,8 +31,7 @@ namespace KokkosSparse { namespace Impl { -template +template struct SortLowDegreeCrsMatrixFunctor { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; @@ -40,27 +39,17 @@ struct SortLowDegreeCrsMatrixFunctor { using team_mem = typename Kokkos::TeamPolicy::member_type; using value_type = lno_t; - SortLowDegreeCrsMatrixFunctor(bool usingRangePol, const rowmap_t& _rowmap, - const entries_t& _entries, - const values_t& _values, - const lno_t _degreeLimit) - : rowmap(_rowmap), - entries(_entries), - values(_values), - degreeLimit(_degreeLimit) { + SortLowDegreeCrsMatrixFunctor(bool usingRangePol, const rowmap_t& _rowmap, const entries_t& _entries, + const values_t& _values, const lno_t _degreeLimit) + : rowmap(_rowmap), entries(_entries), values(_values), degreeLimit(_degreeLimit) { if (usingRangePol) { - entriesAux = - entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), - entries.extent(0)); - valuesAux = - values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), - values.extent(0)); + entriesAux = entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), entries.extent(0)); + valuesAux = values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), values.extent(0)); } // otherwise, aux arrays won't be allocated (sorting in place) } - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, - value_type& reducer) const { + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, value_type& reducer) const { size_type rowStart = rowmap(i); size_type rowEnd = rowmap(i + 1); lno_t rowNum = rowEnd - rowStart; @@ -71,13 +60,11 @@ struct SortLowDegreeCrsMatrixFunctor { // Radix sort requires unsigned keys for comparison using unsigned_lno_t = typename std::make_unsigned::type; KokkosKernels::SerialRadixSort2( - (unsigned_lno_t*)entries.data() + rowStart, - (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, - valuesAux.data() + rowStart, rowNum); + (unsigned_lno_t*)entries.data() + rowStart, (unsigned_lno_t*)entriesAux.data() + rowStart, + values.data() + rowStart, valuesAux.data() + rowStart, rowNum); } - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t, - value_type& reducer) const { + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t, value_type& reducer) const { size_type i = t.league_rank(); size_type rowStart = rowmap(i); size_type rowEnd = rowmap(i + 1); @@ -86,8 +73,8 @@ struct SortLowDegreeCrsMatrixFunctor { Kokkos::single(Kokkos::PerTeam(t), [&]() { reducer++; }); return; } - KokkosKernels::TeamBitonicSort2( - entries.data() + rowStart, values.data() + rowStart, rowNum, t); + KokkosKernels::TeamBitonicSort2(entries.data() + rowStart, + values.data() + rowStart, rowNum, t); } rowmap_t rowmap; @@ -103,23 +90,19 @@ struct SortLowDegreeCrsMatrixFunctor { // Sort a CRS matrix: within each row, sort entries ascending by column. // At the same time, permute the values. // Only modifies rows below the degreeLimit -template +template typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( const rowmap_t& rowmap, const entries_t& entries, const values_t& values, const typename entries_t::non_const_value_type degreeLimit) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); - Impl::SortLowDegreeCrsMatrixFunctor - funct(useRadix, rowmap, entries, values, degreeLimit); + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + Impl::SortLowDegreeCrsMatrixFunctor funct(useRadix, rowmap, entries, + values, degreeLimit); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; lno_t notSorted = 0; if (useRadix) { - Kokkos::parallel_reduce("sort_crs_matrix", - Kokkos::RangePolicy(0, numRows), - funct, notSorted); + Kokkos::parallel_reduce("sort_crs_matrix", Kokkos::RangePolicy(0, numRows), funct, notSorted); } else { // Try to get teamsize to be largest power of 2 not greater than avg entries // per row @@ -136,10 +119,8 @@ typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( teamSize *= 2; } team_pol temp(numRows, teamSize); - teamSize = std::min(teamSize, - temp.team_size_max(funct, Kokkos::ParallelReduceTag())); - Kokkos::parallel_reduce("sort_crs_matrix", team_pol(numRows, teamSize), - funct, notSorted); + teamSize = std::min(teamSize, temp.team_size_max(funct, Kokkos::ParallelReduceTag())); + Kokkos::parallel_reduce("sort_crs_matrix", team_pol(numRows, teamSize), funct, notSorted); } return notSorted; } @@ -156,30 +137,27 @@ template class coarse_builder { public: // define internal types - using matrix_t = crsMat; - using exec_space = typename matrix_t::execution_space; - using mem_space = typename matrix_t::memory_space; - using Device = typename matrix_t::device_type; - using ordinal_t = typename matrix_t::ordinal_type; - using edge_offset_t = typename matrix_t::size_type; - using scalar_t = typename matrix_t::value_type; - using vtx_view_t = Kokkos::View; - using wgt_view_t = Kokkos::View; - using edge_view_t = Kokkos::View; - using edge_subview_t = Kokkos::View; - using graph_type = typename matrix_t::staticcrsgraph_type; - using policy_t = Kokkos::RangePolicy; - using dyn_policy_t = - Kokkos::RangePolicy, exec_space>; - using team_policy_t = Kokkos::TeamPolicy; - using dyn_team_policy_t = - Kokkos::TeamPolicy, exec_space>; - using member = typename team_policy_t::member_type; - using spgemm_kernel_handle = KokkosKernels::Experimental::KokkosKernelsHandle< - edge_offset_t, ordinal_t, scalar_t, exec_space, mem_space, mem_space>; - using uniform_memory_pool_t = - KokkosKernels::Impl::UniformMemoryPool; - using mapper_t = coarsen_heuristics; + using matrix_t = crsMat; + using exec_space = typename matrix_t::execution_space; + using mem_space = typename matrix_t::memory_space; + using Device = typename matrix_t::device_type; + using ordinal_t = typename matrix_t::ordinal_type; + using edge_offset_t = typename matrix_t::size_type; + using scalar_t = typename matrix_t::value_type; + using vtx_view_t = Kokkos::View; + using wgt_view_t = Kokkos::View; + using edge_view_t = Kokkos::View; + using edge_subview_t = Kokkos::View; + using graph_type = typename matrix_t::staticcrsgraph_type; + using policy_t = Kokkos::RangePolicy; + using dyn_policy_t = Kokkos::RangePolicy, exec_space>; + using team_policy_t = Kokkos::TeamPolicy; + using dyn_team_policy_t = Kokkos::TeamPolicy, exec_space>; + using member = typename team_policy_t::member_type; + using spgemm_kernel_handle = KokkosKernels::Experimental::KokkosKernelsHandle; + using uniform_memory_pool_t = KokkosKernels::Impl::UniformMemoryPool; + using mapper_t = coarsen_heuristics; static constexpr ordinal_t get_null_val() { // this value must line up with the null value used by the hashmap // accumulator @@ -189,10 +167,9 @@ class coarse_builder { return std::numeric_limits::max(); } } - static constexpr ordinal_t ORD_MAX = get_null_val(); - static constexpr bool is_host_space = std::is_same< - typename exec_space::memory_space, - typename Kokkos::DefaultHostExecutionSpace::memory_space>::value; + static constexpr ordinal_t ORD_MAX = get_null_val(); + static constexpr bool is_host_space = + std::is_same::value; static constexpr bool scal_eq_ord = std::is_same::value; // contains matrix and vertex weights corresponding to current level // interp matrix maps previous level to this level @@ -222,9 +199,7 @@ class coarse_builder { }; // determine if dynamic scheduling should be used - static bool should_use_dyn( - const ordinal_t n, const Kokkos::View work, - int t_count) { + static bool should_use_dyn(const ordinal_t n, const Kokkos::View work, int t_count) { bool use_dyn = false; edge_offset_t max = 0; edge_offset_t min = std::numeric_limits::max(); @@ -252,19 +227,16 @@ class coarse_builder { // build the course graph according to ((B^T A) B) or (B^T (A B)), where B is // aggregator matrix - static coarse_level_triple build_coarse_graph_spgemm( - coarsen_handle& handle, const coarse_level_triple level, - const matrix_t interp_mtx) { + static coarse_level_triple build_coarse_graph_spgemm(coarsen_handle& handle, const coarse_level_triple level, + const matrix_t interp_mtx) { vtx_view_t f_vtx_w = level.vtx_wgts; matrix_t g = level.mtx; - if (!KokkosSparse::Impl::isCrsGraphSorted(g.graph.row_map, g.graph.entries)) - KokkosSparse::sort_crs_matrix(g); + if (!KokkosSparse::Impl::isCrsGraphSorted(g.graph.row_map, g.graph.entries)) KokkosSparse::sort_crs_matrix(g); ordinal_t n = g.numRows(); ordinal_t nc = interp_mtx.numCols(); - matrix_t interp_transpose = - KokkosSparse::Impl::transpose_matrix(interp_mtx); + matrix_t interp_transpose = KokkosSparse::Impl::transpose_matrix(interp_mtx); KokkosSparse::sort_crs_matrix(interp_transpose); spgemm_kernel_handle kh; @@ -278,78 +250,60 @@ class coarse_builder { if (handle.b == Spgemm_transpose_first) { kh.create_spgemm_handle(); edge_view_t row_map_p1("rows_partial", nc + 1); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, nc, n, n, interp_transpose.graph.row_map, - interp_transpose.graph.entries, false, g.graph.row_map, - g.graph.entries, false, row_map_p1); + KokkosSparse::Experimental::spgemm_symbolic(&kh, nc, n, n, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, g.graph.row_map, + g.graph.entries, false, row_map_p1); // partial-result matrix - vtx_view_t entries_p1("adjacencies_partial", - kh.get_spgemm_handle()->get_c_nnz()); - wgt_view_t values_p1("weights_partial", - kh.get_spgemm_handle()->get_c_nnz()); + vtx_view_t entries_p1("adjacencies_partial", kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", kh.get_spgemm_handle()->get_c_nnz()); KokkosSparse::Experimental::spgemm_numeric( - &kh, nc, n, n, interp_transpose.graph.row_map, - interp_transpose.graph.entries, interp_transpose.values, false, - g.graph.row_map, g.graph.entries, g.values, false, row_map_p1, - entries_p1, values_p1); + &kh, nc, n, n, interp_transpose.graph.row_map, interp_transpose.graph.entries, interp_transpose.values, false, + g.graph.row_map, g.graph.entries, g.values, false, row_map_p1, entries_p1, values_p1); kh.destroy_spgemm_handle(); row_map_coarse = edge_view_t("rows_coarse", nc + 1); kh.create_spgemm_handle(); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, nc, n, nc, row_map_p1, entries_p1, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, false, - row_map_coarse); + KokkosSparse::Experimental::spgemm_symbolic(&kh, nc, n, nc, row_map_p1, entries_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_coarse); // coarse-graph adjacency matrix - adj_coarse = - vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); - wgt_coarse = - wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + adj_coarse = vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); - KokkosSparse::Experimental::spgemm_numeric( - &kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, - false, row_map_coarse, adj_coarse, wgt_coarse); + KokkosSparse::Experimental::spgemm_numeric(&kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, row_map_coarse, adj_coarse, wgt_coarse); kh.destroy_spgemm_handle(); } else { edge_view_t row_map_p1("rows_partial", n + 1); kh.create_spgemm_handle(); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, n, n, nc, g.graph.row_map, g.graph.entries, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, false, - row_map_p1); + KokkosSparse::Experimental::spgemm_symbolic(&kh, n, n, nc, g.graph.row_map, g.graph.entries, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_p1); // partial-result matrix - vtx_view_t entries_p1("adjacencies_partial", - kh.get_spgemm_handle()->get_c_nnz()); - wgt_view_t values_p1("weights_partial", - kh.get_spgemm_handle()->get_c_nnz()); + vtx_view_t entries_p1("adjacencies_partial", kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", kh.get_spgemm_handle()->get_c_nnz()); - KokkosSparse::Experimental::spgemm_numeric( - &kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false, - interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, - false, row_map_p1, entries_p1, values_p1); + KokkosSparse::Experimental::spgemm_numeric(&kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, row_map_p1, entries_p1, values_p1); kh.destroy_spgemm_handle(); row_map_coarse = edge_view_t("rows_coarse", nc + 1); kh.create_spgemm_handle(); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, nc, n, nc, interp_transpose.graph.row_map, - interp_transpose.graph.entries, false, row_map_p1, entries_p1, false, - row_map_coarse); + KokkosSparse::Experimental::spgemm_symbolic(&kh, nc, n, nc, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, row_map_p1, entries_p1, false, + row_map_coarse); // coarse-graph adjacency matrix - adj_coarse = - vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); - wgt_coarse = - wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + adj_coarse = vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); KokkosSparse::Experimental::spgemm_numeric( - &kh, nc, n, nc, interp_transpose.graph.row_map, - interp_transpose.graph.entries, interp_transpose.values, false, - row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse, - wgt_coarse); + &kh, nc, n, nc, interp_transpose.graph.row_map, interp_transpose.graph.entries, interp_transpose.values, + false, row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse, wgt_coarse); kh.destroy_spgemm_handle(); } @@ -362,8 +316,7 @@ class coarse_builder { Kokkos::parallel_for( policy_t(0, nc), KOKKOS_LAMBDA(ordinal_t u) { - for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); - j++) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); j++) { if (adj_coarse(j) != u) { nonLoops(u)++; } @@ -373,8 +326,7 @@ class coarse_builder { edge_view_t row_map_nonloop("nonloop row map", nc + 1); Kokkos::parallel_scan( - policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& update, - const bool final) { + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& update, const bool final) { const edge_offset_t val_i = nonLoops(i); update += val_i; if (final) { @@ -394,8 +346,7 @@ class coarse_builder { Kokkos::parallel_for( policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t u) { - for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); - j++) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); j++) { if (adj_coarse(j) != u) { edge_offset_t offset = row_map_nonloop(u) + nonLoops(u)++; entries_nonloop(offset) = adj_coarse(j); @@ -412,8 +363,7 @@ class coarse_builder { vtx_view_t c_vtx_w("coarse vtx weights", interp_mtx.numCols()); Kokkos::parallel_for( - "compute coarse vtx wgts", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i) { + "compute coarse vtx wgts", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { ordinal_t u = interp_mtx.graph.entries(i); Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); }); @@ -431,12 +381,10 @@ class coarse_builder { vtx_view_t input; edge_view_t output; - prefix_sum(vtx_view_t _input, edge_view_t _output) - : input(_input), output(_output) {} + prefix_sum(vtx_view_t _input, edge_view_t _output) : input(_input), output(_output) {} KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_t i, edge_offset_t& update, - const bool final) const { + void operator()(const ordinal_t i, edge_offset_t& update, const bool final) const { const edge_offset_t val_i = input(i); update += val_i; if (final) { @@ -455,11 +403,8 @@ class coarse_builder { vtx_view_t dedupe_edge_count; ordinal_t degreeLimit; - functorDedupeLowDegreeAfterSort(edge_view_t _row_map, vtx_view_t _entries, - vtx_view_t _entriesOut, wgt_view_t _wgts, - wgt_view_t _wgtsOut, - vtx_view_t _dedupe_edge_count, - ordinal_t _degreeLimit_) + functorDedupeLowDegreeAfterSort(edge_view_t _row_map, vtx_view_t _entries, vtx_view_t _entriesOut, wgt_view_t _wgts, + wgt_view_t _wgtsOut, vtx_view_t _dedupe_edge_count, ordinal_t _degreeLimit_) : row_map(_row_map), entries(_entries), entriesOut(_entriesOut), @@ -477,31 +422,28 @@ class coarse_builder { if (degree > degreeLimit) { return; } - Kokkos::parallel_scan( - Kokkos::TeamThreadRange(thread, start, end), - [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { - if (i == start) { - update += 1; - } else if (entries(i) != entries(i - 1)) { - update += 1; - } - if (final) { - entriesOut(start + update - 1) = entries(i); - // requires that wgtsOut be initialized to 0 - Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); - if (i + 1 == end) { - dedupe_edge_count(u) = update; - } - } - }); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, start + dedupe_edge_count(u)), - [&](const edge_offset_t& i) { - entries(i) = entriesOut(i); - wgts(i) = wgtsOut(i); - }); - Kokkos::single(Kokkos::PerTeam(thread), - [&]() { thread_sum += dedupe_edge_count(u); }); + Kokkos::parallel_scan(Kokkos::TeamThreadRange(thread, start, end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, start + dedupe_edge_count(u)), + [&](const edge_offset_t& i) { + entries(i) = entriesOut(i); + wgts(i) = wgtsOut(i); + }); + Kokkos::single(Kokkos::PerTeam(thread), [&]() { thread_sum += dedupe_edge_count(u); }); } KOKKOS_INLINE_FUNCTION @@ -536,8 +478,7 @@ class coarse_builder { wgt_view_t wgts, wgtsOut; vtx_view_t dedupe_edge_count; - functorDedupeAfterSort(edge_view_t _row_map, vtx_view_t _entries, - vtx_view_t _entriesOut, wgt_view_t _wgts, + functorDedupeAfterSort(edge_view_t _row_map, vtx_view_t _entries, vtx_view_t _entriesOut, wgt_view_t _wgts, wgt_view_t _wgtsOut, vtx_view_t _dedupe_edge_count) : row_map(_row_map), entries(_entries), @@ -551,25 +492,23 @@ class coarse_builder { ordinal_t u = thread.league_rank(); edge_offset_t start = row_map(u); edge_offset_t end = row_map(u + 1); - Kokkos::parallel_scan( - Kokkos::TeamThreadRange(thread, start, end), - [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { - if (i == start) { - update += 1; - } else if (entries(i) != entries(i - 1)) { - update += 1; - } - if (final) { - entriesOut(start + update - 1) = entries(i); - // requires that wgtsOut be initialized to 0 - Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); - if (i + 1 == end) { - dedupe_edge_count(u) = update; - } - } - }); - Kokkos::single(Kokkos::PerTeam(thread), - [&]() { thread_sum += dedupe_edge_count(u); }); + Kokkos::parallel_scan(Kokkos::TeamThreadRange(thread, start, end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::single(Kokkos::PerTeam(thread), [&]() { thread_sum += dedupe_edge_count(u); }); } KOKKOS_INLINE_FUNCTION @@ -601,11 +540,10 @@ class coarse_builder { const wgt_view_t source_wgts; wgt_view_t target_wgts; - functorCollapseDirectedToUndirected( - const edge_view_t _source_row_map, const edge_view_t _target_row_map, - const vtx_view_t _source_edge_counts, vtx_view_t _target_edge_counts, - const vtx_view_t _source_destinations, vtx_view_t _target_destinations, - const wgt_view_t _source_wgts, wgt_view_t _target_wgts) + functorCollapseDirectedToUndirected(const edge_view_t _source_row_map, const edge_view_t _target_row_map, + const vtx_view_t _source_edge_counts, vtx_view_t _target_edge_counts, + const vtx_view_t _source_destinations, vtx_view_t _target_destinations, + const wgt_view_t _source_wgts, wgt_view_t _target_wgts) : source_row_map(_source_row_map), target_row_map(_target_row_map), source_edge_counts(_source_edge_counts), @@ -620,24 +558,18 @@ class coarse_builder { ordinal_t u = thread.league_rank(); edge_offset_t u_origin = source_row_map(u); edge_offset_t u_dest_offset = target_row_map(u); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, source_edge_counts(u)), - [&](const edge_offset_t u_idx) { - ordinal_t v = source_destinations(u_origin + u_idx); - scalar_t wgt = source_wgts(u_origin + u_idx); - edge_offset_t v_dest_offset = target_row_map(v); - edge_offset_t v_dest = - v_dest_offset + - Kokkos::atomic_fetch_add(&target_edge_counts(v), 1); - edge_offset_t u_dest = - u_dest_offset + - Kokkos::atomic_fetch_add(&target_edge_counts(u), 1); - - target_destinations(u_dest) = v; - target_wgts(u_dest) = wgt; - target_destinations(v_dest) = u; - target_wgts(v_dest) = wgt; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, source_edge_counts(u)), [&](const edge_offset_t u_idx) { + ordinal_t v = source_destinations(u_origin + u_idx); + scalar_t wgt = source_wgts(u_origin + u_idx); + edge_offset_t v_dest_offset = target_row_map(v); + edge_offset_t v_dest = v_dest_offset + Kokkos::atomic_fetch_add(&target_edge_counts(v), 1); + edge_offset_t u_dest = u_dest_offset + Kokkos::atomic_fetch_add(&target_edge_counts(u), 1); + + target_destinations(u_dest) = v; + target_wgts(u_dest) = wgt; + target_destinations(v_dest) = u; + target_wgts(v_dest) = wgt; + }); } }; @@ -654,14 +586,10 @@ class coarse_builder { vtx_view_t remaining; bool use_out; - functorHashmapAccumulator(edge_view_t _row_map, vtx_view_t _entries_in, - vtx_view_t _entries_out, wgt_view_t _wgts_in, - wgt_view_t _wgts_out, - vtx_view_t _dedupe_edge_count, - uniform_memory_pool_t _memory_pool, - const ordinal_t _hash_size, - const ordinal_t _max_hash_entries, - vtx_view_t _remaining, bool _use_out) + functorHashmapAccumulator(edge_view_t _row_map, vtx_view_t _entries_in, vtx_view_t _entries_out, + wgt_view_t _wgts_in, wgt_view_t _wgts_out, vtx_view_t _dedupe_edge_count, + uniform_memory_pool_t _memory_pool, const ordinal_t _hash_size, + const ordinal_t _max_hash_entries, vtx_view_t _remaining, bool _use_out) : row_map(_row_map), entries_in(_entries_in), entries_out(_entries_out), @@ -680,12 +608,10 @@ class coarse_builder { if (std::is_same::value) return 0; #endif #if defined(KOKKOS_ENABLE_OPENMP) - if (std::is_same::value) - return Kokkos::OpenMP::impl_hardware_thread_id(); + if (std::is_same::value) return Kokkos::OpenMP::impl_hardware_thread_id(); #endif #if defined(KOKKOS_ENABLE_THREADS) - if (std::is_same::value) - return Kokkos::Threads::impl_hardware_thread_id(); + if (std::is_same::value) return Kokkos::Threads::impl_hardware_thread_id(); #endif return row_index; } @@ -745,17 +671,15 @@ class coarse_builder { // Set pointer to hash values scalar_t* values = (scalar_t*)wgts_out.data() + row_map(idx); - KokkosKernels::Experimental::HashmapAccumulator< - hash_size_type, hash_key_type, hash_value_type, - KokkosKernels::Experimental::HashOpType::bitwiseAnd> - hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, - values); + KokkosKernels::Experimental::HashmapAccumulator + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, values); for (edge_offset_t i = row_map(idx); i < row_map(idx + 1); i++) { ordinal_t key = entries_in(i); scalar_t value = wgts_in(i); - hash_map.sequential_insert_into_hash_mergeAdd_TrackHashes( - key, value, used_hash_size, used_hash_count, used_hash_indices); + hash_map.sequential_insert_into_hash_mergeAdd_TrackHashes(key, value, used_hash_size, used_hash_count, + used_hash_indices); }; // Reset the Begins values to -1 before releasing the memory pool chunk. @@ -797,8 +721,7 @@ class coarse_builder { // Acquire a chunk from the memory pool using a spin-loop. ptr_write = nullptr; while (nullptr == ptr_write) { - ptr_write = (volatile ordinal_t*)(memory_pool.allocate_chunk( - thread.league_rank())); + ptr_write = (volatile ordinal_t*)(memory_pool.allocate_chunk(thread.league_rank())); } }, ptr_temp); @@ -848,29 +771,23 @@ class coarse_builder { values = (scalar_t*)(ptr_temp); } - KokkosKernels::Experimental::HashmapAccumulator< - hash_size_type, hash_key_type, hash_value_type, - KokkosKernels::Experimental::HashOpType::bitwiseAnd> - hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, - values); - - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(thread, row_map(idx), row_map(idx + 1)), - [&](const edge_offset_t& i) { - ordinal_t key = entries_in(i); - scalar_t value = wgts_in(i); - // duplicate keys may be inserted simultaneously, this causes - // problems we must handle later - int r = - hash_map - .vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( - key, value, used_hash_size, used_hash_count, - used_hash_indices); - - // Check return code - if (r) { - } - }); + KokkosKernels::Experimental::HashmapAccumulator + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, values); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(thread, row_map(idx), row_map(idx + 1)), + [&](const edge_offset_t& i) { + ordinal_t key = entries_in(i); + scalar_t value = wgts_in(i); + // duplicate keys may be inserted simultaneously, this causes + // problems we must handle later + int r = hash_map.vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( + key, value, used_hash_size, used_hash_count, used_hash_indices); + + // Check return code + if (r) { + } + }); thread.team_barrier(); // Reset the Begins values to -1 before releasing the memory pool chunk. @@ -879,72 +796,49 @@ class coarse_builder { // there can be duplicate key insertions (these are hopefully rare or else // performance will suffer) This did not work as a TeamThreadRange, don't // know why (possibly issues with atomic addition on write_idx) - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *used_hash_count), - [&](const ordinal_t& i) { - ordinal_t dirty_hash = used_hash_indices[i]; - - ordinal_t bucket = hash_begins[dirty_hash]; - - // ascending-key bubble-sort the linked list - // it really do be like that sometimes - ordinal_t end_inner = ORD_MAX; - while (end_inner != bucket) { - ordinal_t last_idx = bucket; - ordinal_t last_key = keys[last_idx]; - scalar_t last_val = values[last_idx]; - bool is_sorted = true; - // bubble-up - for (ordinal_t k = hash_nexts[bucket]; k != end_inner; - k = hash_nexts[k]) { - // swap - if (keys[k] < last_key) { - keys[last_idx] = keys[k]; - values[last_idx] = values[k]; - keys[k] = last_key; - values[k] = last_val; - is_sorted = false; - } - // increment last - last_key = keys[k]; - last_val = values[k]; - last_idx = k; - } - end_inner = last_idx; - if (is_sorted) { - // end the outer loop - end_inner = bucket; - } - } - ordinal_t key = keys[bucket]; - scalar_t val = values[bucket]; - ordinal_t last = bucket; - // merge linked list and write out - for (ordinal_t j = hash_nexts[bucket]; j != ORD_MAX; - j = hash_nexts[j]) { - if (keys[j] == key) { - val += values[j]; - } else { - ordinal_t write_at = - row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); - entries_out(write_at) = key; - if (use_out) { - // reuse wgts_in as scratch space because we are overwriting - // working memory if we use wgts_out - wgts_in(write_at) = val; - } else { - wgts_out(write_at) = val; - } - key = keys[j]; - val = values[j]; - } - hash_nexts[last] = ORD_MAX; - last = j; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *used_hash_count), [&](const ordinal_t& i) { + ordinal_t dirty_hash = used_hash_indices[i]; + + ordinal_t bucket = hash_begins[dirty_hash]; + + // ascending-key bubble-sort the linked list + // it really do be like that sometimes + ordinal_t end_inner = ORD_MAX; + while (end_inner != bucket) { + ordinal_t last_idx = bucket; + ordinal_t last_key = keys[last_idx]; + scalar_t last_val = values[last_idx]; + bool is_sorted = true; + // bubble-up + for (ordinal_t k = hash_nexts[bucket]; k != end_inner; k = hash_nexts[k]) { + // swap + if (keys[k] < last_key) { + keys[last_idx] = keys[k]; + values[last_idx] = values[k]; + keys[k] = last_key; + values[k] = last_val; + is_sorted = false; } - hash_nexts[last] = ORD_MAX; - // write out the final entry in linked list - ordinal_t write_at = - row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + // increment last + last_key = keys[k]; + last_val = values[k]; + last_idx = k; + } + end_inner = last_idx; + if (is_sorted) { + // end the outer loop + end_inner = bucket; + } + } + ordinal_t key = keys[bucket]; + scalar_t val = values[bucket]; + ordinal_t last = bucket; + // merge linked list and write out + for (ordinal_t j = hash_nexts[bucket]; j != ORD_MAX; j = hash_nexts[j]) { + if (keys[j] == key) { + val += values[j]; + } else { + ordinal_t write_at = row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); entries_out(write_at) = key; if (use_out) { // reuse wgts_in as scratch space because we are overwriting @@ -953,17 +847,31 @@ class coarse_builder { } else { wgts_out(write_at) = val; } - hash_begins[dirty_hash] = ORD_MAX; - }); + key = keys[j]; + val = values[j]; + } + hash_nexts[last] = ORD_MAX; + last = j; + } + hash_nexts[last] = ORD_MAX; + // write out the final entry in linked list + ordinal_t write_at = row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + entries_out(write_at) = key; + if (use_out) { + // reuse wgts_in as scratch space because we are overwriting + // working memory if we use wgts_out + wgts_in(write_at) = val; + } else { + wgts_out(write_at) = val; + } + hash_begins[dirty_hash] = ORD_MAX; + }); thread.team_barrier(); // need to copy from wgts_in to wgts_out if we used wgts_in as scratch // space if (use_out) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *write_idx), - [&](const ordinal_t& i) { - wgts_out(row_map(idx) + i) = wgts_in(row_map(idx) + i); - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *write_idx), + [&](const ordinal_t& i) { wgts_out(row_map(idx) + i) = wgts_in(row_map(idx) + i); }); } Kokkos::single(Kokkos::PerTeam(thread), [&]() { @@ -978,14 +886,11 @@ class coarse_builder { }; // functorHashmapAccumulator - static void getHashmapSizeAndCount( - coarsen_handle& handle, const ordinal_t n, - const ordinal_t remaining_count, vtx_view_t remaining, - vtx_view_t edges_per_source, ordinal_t& hash_size, ordinal_t& max_entries, - ordinal_t& mem_chunk_size, ordinal_t& mem_chunk_count) { + static void getHashmapSizeAndCount(coarsen_handle& handle, const ordinal_t n, const ordinal_t remaining_count, + vtx_view_t remaining, vtx_view_t edges_per_source, ordinal_t& hash_size, + ordinal_t& max_entries, ordinal_t& mem_chunk_size, ordinal_t& mem_chunk_count) { ordinal_t avg_entries = 0; - if (!is_host_space && - static_cast(remaining_count) / static_cast(n) > 0.01) { + if (!is_host_space && static_cast(remaining_count) / static_cast(n) > 0.01) { Kokkos::parallel_reduce( "calc average among remaining", policy_t(0, remaining_count), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { @@ -1024,12 +929,11 @@ class coarse_builder { } // Determine memory chunk size for UniformMemoryPool - mem_chunk_size = hash_size; // for hash indices - mem_chunk_size += hash_size; // for hash begins - mem_chunk_size += - 3 * max_entries; // for hash nexts, keys, and values (unless scalar_t - // != ordinal_t, in which case memory is unused) - mem_chunk_size += 10; // for metadata + mem_chunk_size = hash_size; // for hash indices + mem_chunk_size += hash_size; // for hash begins + mem_chunk_size += 3 * max_entries; // for hash nexts, keys, and values (unless scalar_t + // != ordinal_t, in which case memory is unused) + mem_chunk_size += 10; // for metadata mem_chunk_count = exec_space().concurrency(); if (mem_chunk_count > remaining_count) { mem_chunk_count = remaining_count + 1; @@ -1037,34 +941,27 @@ class coarse_builder { if (!is_host_space) { // decrease number of mem_chunks to reduce memory usage if necessary - size_t mem_needed = static_cast(mem_chunk_count) * - static_cast(mem_chunk_size) * - sizeof(ordinal_t); + size_t mem_needed = + static_cast(mem_chunk_count) * static_cast(mem_chunk_size) * sizeof(ordinal_t); //~500MB size_t max_mem_allowed = handle.max_mem_allowed; if (mem_needed > max_mem_allowed) { size_t chunk_dif = mem_needed - max_mem_allowed; - chunk_dif = chunk_dif / - (static_cast(mem_chunk_size) * sizeof(ordinal_t)); + chunk_dif = chunk_dif / (static_cast(mem_chunk_size) * sizeof(ordinal_t)); chunk_dif++; mem_chunk_count -= chunk_dif; } } } - static void deduplicate_graph(coarsen_handle& handle, const ordinal_t n, - const bool use_team, - vtx_view_t edges_per_source, - vtx_view_t dest_by_source, - wgt_view_t wgt_by_source, - const edge_view_t source_bucket_offset, - edge_offset_t& gc_nedges) { + static void deduplicate_graph(coarsen_handle& handle, const ordinal_t n, const bool use_team, + vtx_view_t edges_per_source, vtx_view_t dest_by_source, wgt_view_t wgt_by_source, + const edge_view_t source_bucket_offset, edge_offset_t& gc_nedges) { if (handle.b == Hashmap || is_host_space) { ordinal_t remaining_count = n; vtx_view_t remaining("remaining vtx", n); Kokkos::parallel_for( - policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i) { remaining(i) = i; }); + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { remaining(i) = i; }); // deduplicate rows in phases starting with the small degree rows so we // can use small hashmaps increase the hashmap size each phase to the // necessary size for twice the average of remaining rows @@ -1076,12 +973,10 @@ class coarse_builder { do { // determine size for hashmap ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; - getHashmapSizeAndCount(handle, n, remaining_count, remaining, - edges_per_source, hash_size, max_entries, + getHashmapSizeAndCount(handle, n, remaining_count, remaining, edges_per_source, hash_size, max_entries, mem_chunk_size, mem_chunk_count); // Create Uniform Initialized Memory Pool - KokkosKernels::Impl::PoolType pool_type = - KokkosKernels::Impl::ManyThread2OneChunk; + KokkosKernels::Impl::PoolType pool_type = KokkosKernels::Impl::ManyThread2OneChunk; if (is_host_space) { pool_type = KokkosKernels::Impl::OneThread2OneChunk; @@ -1089,29 +984,23 @@ class coarse_builder { bool use_dyn = should_use_dyn(n, source_bucket_offset, mem_chunk_count); - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, - ORD_MAX, pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, ORD_MAX, pool_type); - functorHashmapAccumulator hashmapAccumulator( - source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, - wgt_out, edges_per_source, memory_pool, hash_size, max_entries, - remaining, !scal_eq_ord); + functorHashmapAccumulator hashmapAccumulator(source_bucket_offset, dest_by_source, dest_by_source, + wgt_by_source, wgt_out, edges_per_source, memory_pool, hash_size, + max_entries, remaining, !scal_eq_ord); ordinal_t old_remaining_count = remaining_count; if (!is_host_space && max_entries >= 128) { - Kokkos::parallel_reduce("hashmap time", - team_policy_t(old_remaining_count, 1, 64), - hashmapAccumulator, remaining_count); + Kokkos::parallel_reduce("hashmap time", team_policy_t(old_remaining_count, 1, 64), hashmapAccumulator, + remaining_count); } else { if (use_dyn) { - Kokkos::parallel_reduce( - "hashmap time", - dyn_policy_t(0, old_remaining_count, Kokkos::ChunkSize(128)), - hashmapAccumulator, remaining_count); - } else { - Kokkos::parallel_reduce("hashmap time", - policy_t(0, old_remaining_count), + Kokkos::parallel_reduce("hashmap time", dyn_policy_t(0, old_remaining_count, Kokkos::ChunkSize(128)), hashmapAccumulator, remaining_count); + } else { + Kokkos::parallel_reduce("hashmap time", policy_t(0, old_remaining_count), hashmapAccumulator, + remaining_count); } } @@ -1120,8 +1009,7 @@ class coarse_builder { Kokkos::parallel_scan( "move remaining vertices", policy_t(0, old_remaining_count), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = remaining(i); if (edges_per_source(u) >= max_entries) { if (final) { @@ -1135,39 +1023,31 @@ class coarse_builder { } } while (remaining_count > 0); Kokkos::parallel_reduce( - policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { - sum += edges_per_source(i); - }, + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { sum += edges_per_source(i); }, gc_nedges); if (!scal_eq_ord && !is_host_space) { Kokkos::deep_copy(wgt_by_source, wgt_out); } } else if (handle.b == Sort) { // sort the (implicit) crs matrix - KokkosSparse::sort_crs_matrix(source_bucket_offset, - dest_by_source, wgt_by_source); + KokkosSparse::sort_crs_matrix(source_bucket_offset, + dest_by_source, wgt_by_source); // combine adjacent entries that are equal if (use_team) { // thread team version wgt_view_t wgts_out("wgts after dedupe", wgt_by_source.extent(0)); vtx_view_t dest_out("dest after dedupe", dest_by_source.extent(0)); - functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, - dest_out, wgt_by_source, wgts_out, + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, dest_out, wgt_by_source, wgts_out, edges_per_source); - Kokkos::parallel_reduce("deduplicated sorted", team_policy_t(n, 64), - deduper, gc_nedges); + Kokkos::parallel_reduce("deduplicated sorted", team_policy_t(n, 64), deduper, gc_nedges); Kokkos::deep_copy(wgt_by_source, wgts_out); Kokkos::deep_copy(dest_by_source, dest_out); } else { // no thread team version - functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, - dest_by_source, wgt_by_source, + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, wgt_by_source, edges_per_source); - Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, - gc_nedges); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, gc_nedges); } } else if (handle.b == Hybrid) { @@ -1179,23 +1059,19 @@ class coarse_builder { ordinal_t limit = 128; // sort the (implicit) crs matrix, but only the low degree rows ordinal_t remaining_count = - KokkosSparse::sort_low_degree_rows_crs_matrix( + KokkosSparse::sort_low_degree_rows_crs_matrix( source_bucket_offset, dest_by_source, wgt_by_source, limit); // combine adjacent entries that are equal { // no thread team version - functorDedupeLowDegreeAfterSort deduper( - source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, - wgt_out, edges_per_source, limit); - Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, - gc_nedges); + functorDedupeLowDegreeAfterSort deduper(source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, limit); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, gc_nedges); } vtx_view_t remaining("remaining vtx", remaining_count); Kokkos::parallel_scan( "move remaining vertices", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { if (edges_per_source(i) > limit) { if (final) { remaining(update) = i; @@ -1209,34 +1085,28 @@ class coarse_builder { while (remaining_count > 0) { // determine size for hashmap ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; - getHashmapSizeAndCount(handle, n, remaining_count, remaining, - edges_per_source, hash_size, max_entries, + getHashmapSizeAndCount(handle, n, remaining_count, remaining, edges_per_source, hash_size, max_entries, mem_chunk_size, mem_chunk_count); // Create Uniform Initialized Memory Pool - KokkosKernels::Impl::PoolType pool_type = - KokkosKernels::Impl::ManyThread2OneChunk; + KokkosKernels::Impl::PoolType pool_type = KokkosKernels::Impl::ManyThread2OneChunk; if (is_host_space) { pool_type = KokkosKernels::Impl::OneThread2OneChunk; } - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, - ORD_MAX, pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, ORD_MAX, pool_type); - functorHashmapAccumulator hashmapAccumulator( - source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, - wgt_out, edges_per_source, memory_pool, hash_size, max_entries, - remaining, !scal_eq_ord); + functorHashmapAccumulator hashmapAccumulator(source_bucket_offset, dest_by_source, dest_by_source, + wgt_by_source, wgt_out, edges_per_source, memory_pool, hash_size, + max_entries, remaining, !scal_eq_ord); ordinal_t old_remaining_count = remaining_count; if (!is_host_space && max_entries >= 128) { - Kokkos::parallel_reduce("hashmap time", - dyn_team_policy_t(old_remaining_count, 1, 64), - hashmapAccumulator, remaining_count); + Kokkos::parallel_reduce("hashmap time", dyn_team_policy_t(old_remaining_count, 1, 64), hashmapAccumulator, + remaining_count); } else { - Kokkos::parallel_reduce("hashmap time", - dyn_policy_t(0, old_remaining_count), - hashmapAccumulator, remaining_count); + Kokkos::parallel_reduce("hashmap time", dyn_policy_t(0, old_remaining_count), hashmapAccumulator, + remaining_count); } if (remaining_count > 0) { @@ -1244,8 +1114,7 @@ class coarse_builder { Kokkos::parallel_scan( "move remaining vertices", policy_t(0, old_remaining_count), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = remaining(i); if (edges_per_source(u) >= max_entries) { if (final) { @@ -1260,10 +1129,7 @@ class coarse_builder { } gc_nedges = 0; Kokkos::parallel_reduce( - policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { - sum += edges_per_source(i); - }, + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { sum += edges_per_source(i); }, gc_nedges); if (!scal_eq_ord && !is_host_space) { Kokkos::deep_copy(wgt_by_source, wgt_out); @@ -1279,10 +1145,8 @@ class coarse_builder { wgt_view_t wgts_out; ordinal_t workLength; - translationFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, - vtx_view_t _edges_per_source, - edge_view_t _source_bucket_offset, vtx_view_t _edges_out, - wgt_view_t _wgts_out) + translationFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, vtx_view_t _edges_per_source, + edge_view_t _source_bucket_offset, vtx_view_t _edges_out, wgt_view_t _wgts_out) : vcmap(_vcmap), g(_g), mapped_edges(_mapped_edges), @@ -1299,20 +1163,18 @@ class coarse_builder { ordinal_t u = vcmap.graph.entries(i); edge_offset_t start = g.graph.row_map(i); edge_offset_t end = g.graph.row_map(i + 1); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, start, end), - [&](const edge_offset_t idx) { - ordinal_t v = mapped_edges(idx); - if (u != v) { - // fix this, inefficient - edge_offset_t offset = Kokkos::atomic_fetch_add( - &edges_per_source(u), 1); - - offset += source_bucket_offset(u); - - edges_out(offset) = v; - wgts_out(offset) = g.values(idx); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, start, end), [&](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + if (u != v) { + // fix this, inefficient + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + edges_out(offset) = v; + wgts_out(offset) = g.values(idx); + } + }); } KOKKOS_INLINE_FUNCTION @@ -1324,8 +1186,7 @@ class coarse_builder { ordinal_t v = mapped_edges(idx); if (u != v) { // fix this - edge_offset_t offset = - Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); offset += source_bucket_offset(u); @@ -1337,18 +1198,14 @@ class coarse_builder { }; // optimized for regular distribution low degree rows - static coarse_level_triple build_nonskew(coarsen_handle& handle, - const matrix_t g, - const matrix_t vcmap, - vtx_view_t mapped_edges, - vtx_view_t edges_per_source) { + static coarse_level_triple build_nonskew(coarsen_handle& handle, const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, vtx_view_t edges_per_source) { ordinal_t n = g.numRows(); ordinal_t nc = vcmap.numCols(); edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); edge_offset_t gc_nedges = 0; - Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), - prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), prefix_sum(edges_per_source, source_bucket_offset)); Kokkos::deep_copy(edges_per_source, static_cast(0)); @@ -1360,39 +1217,30 @@ class coarse_builder { wgt_view_t wgt_by_source("wgt_by_source", nnz_pre_dedupe); // translates fine entries into coarse entries and writes into coarse rows - translationFunctor translateF(vcmap, g, mapped_edges, edges_per_source, - source_bucket_offset, dest_by_source, + translationFunctor translateF(vcmap, g, mapped_edges, edges_per_source, source_bucket_offset, dest_by_source, wgt_by_source); if (is_host_space) { - bool use_dyn = - should_use_dyn(n, g.graph.row_map, exec_space().concurrency()); + bool use_dyn = should_use_dyn(n, g.graph.row_map, exec_space().concurrency()); if (use_dyn) { - Kokkos::parallel_for("move edges to coarse matrix", dyn_policy_t(0, n), - translateF); + Kokkos::parallel_for("move edges to coarse matrix", dyn_policy_t(0, n), translateF); } else { - Kokkos::parallel_for("move edges to coarse matrix", policy_t(0, n), - translateF); + Kokkos::parallel_for("move edges to coarse matrix", policy_t(0, n), translateF); } } else { - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - n, g.nnz(), execSpaceEnum); + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(n, g.nnz(), execSpaceEnum); team_policy_t dummy(1, 1, vectorLength); int teamSize = dummy.team_size_max(translateF, Kokkos::ParallelForTag()); - Kokkos::parallel_for( - "move edges to coarse matrix", - team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), - translateF); + Kokkos::parallel_for("move edges to coarse matrix", + team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), translateF); } - deduplicate_graph(handle, nc, false, edges_per_source, dest_by_source, - wgt_by_source, source_bucket_offset, gc_nedges); + deduplicate_graph(handle, nc, false, edges_per_source, dest_by_source, wgt_by_source, source_bucket_offset, + gc_nedges); edge_view_t source_offsets("source_offsets", nc + 1); - Kokkos::parallel_scan("calc source offsets again", policy_t(0, nc), - prefix_sum(edges_per_source, source_offsets)); + Kokkos::parallel_scan("calc source offsets again", policy_t(0, nc), prefix_sum(edges_per_source, source_offsets)); edge_subview_t edge_total_subview = Kokkos::subview(source_offsets, nc); Kokkos::deep_copy(gc_nedges, edge_total_subview); @@ -1401,12 +1249,10 @@ class coarse_builder { wgt_view_t wgts("wgts", gc_nedges); if (is_host_space) { - bool use_dyn = - should_use_dyn(nc, source_offsets, exec_space().concurrency()); + bool use_dyn = should_use_dyn(nc, source_offsets, exec_space().concurrency()); if (use_dyn) { Kokkos::parallel_for( - "move deduped edges to new coarse matrix", dyn_policy_t(0, nc), - KOKKOS_LAMBDA(const ordinal_t& u) { + "move deduped edges to new coarse matrix", dyn_policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t& u) { edge_offset_t start_origin = source_bucket_offset(u); edge_offset_t start_dest = source_offsets(u); for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { @@ -1416,8 +1262,7 @@ class coarse_builder { }); } else { Kokkos::parallel_for( - "move deduped edges to new coarse matrix", policy_t(0, nc), - KOKKOS_LAMBDA(const ordinal_t& u) { + "move deduped edges to new coarse matrix", policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t& u) { edge_offset_t start_origin = source_bucket_offset(u); edge_offset_t start_dest = source_offsets(u); for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { @@ -1428,18 +1273,15 @@ class coarse_builder { } } else { Kokkos::parallel_for( - "move deduped edges to new coarse matrix", - team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + "move deduped edges to new coarse matrix", team_policy_t(nc, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { ordinal_t u = thread.league_rank(); edge_offset_t start_origin = source_bucket_offset(u); edge_offset_t start_dest = source_offsets(u); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, edges_per_source(u)), - [=](const ordinal_t idx) { - dest_idx(start_dest + idx) = - dest_by_source(start_origin + idx); - wgts(start_dest + idx) = wgt_by_source(start_origin + idx); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, edges_per_source(u)), [=](const ordinal_t idx) { + dest_idx(start_dest + idx) = dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + }); }); } @@ -1452,37 +1294,33 @@ class coarse_builder { } // forms the explicit matrix created by symmetrizing the implicit matrix - static matrix_t collapse_directed_to_undirected( - const ordinal_t nc, const vtx_view_t source_edge_counts, - const edge_view_t source_row_map, const vtx_view_t source_destinations, - const wgt_view_t source_wgts) { + static matrix_t collapse_directed_to_undirected(const ordinal_t nc, const vtx_view_t source_edge_counts, + const edge_view_t source_row_map, + const vtx_view_t source_destinations, const wgt_view_t source_wgts) { vtx_view_t coarse_degree("coarse degree", nc); Kokkos::deep_copy(coarse_degree, source_edge_counts); Kokkos::parallel_for( - "count directed edges owned by opposite endpoint", - team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + "count directed edges owned by opposite endpoint", team_policy_t(nc, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { ordinal_t u = thread.league_rank(); edge_offset_t start = source_row_map(u); edge_offset_t end = start + source_edge_counts(u); - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = source_destinations(idx); - // increment other vertex - Kokkos::atomic_fetch_add(&coarse_degree(v), 1); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = source_destinations(idx); + // increment other vertex + Kokkos::atomic_fetch_add(&coarse_degree(v), 1); + }); }); edge_view_t target_row_map("target row map", nc + 1); - Kokkos::parallel_scan("calc target row map", policy_t(0, nc), - prefix_sum(coarse_degree, target_row_map)); + Kokkos::parallel_scan("calc target row map", policy_t(0, nc), prefix_sum(coarse_degree, target_row_map)); Kokkos::deep_copy(coarse_degree, static_cast(0)); - edge_offset_t coarse_edges_total = 0; - edge_subview_t coarse_edge_total_subview = - Kokkos::subview(target_row_map, nc); + edge_offset_t coarse_edges_total = 0; + edge_subview_t coarse_edge_total_subview = Kokkos::subview(target_row_map, nc); Kokkos::deep_copy(coarse_edges_total, coarse_edge_total_subview); vtx_view_t dest_idx("dest_idx", coarse_edges_total); @@ -1490,9 +1328,8 @@ class coarse_builder { Kokkos::parallel_for( "move edges into correct size matrix", team_policy_t(nc, Kokkos::AUTO), - functorCollapseDirectedToUndirected( - source_row_map, target_row_map, source_edge_counts, coarse_degree, - source_destinations, dest_idx, source_wgts, wgts)); + functorCollapseDirectedToUndirected(source_row_map, target_row_map, source_edge_counts, coarse_degree, + source_destinations, dest_idx, source_wgts, wgts)); graph_type gc_graph(dest_idx, target_row_map); matrix_t gc("gc", nc, wgts, gc_graph); @@ -1500,10 +1337,8 @@ class coarse_builder { } // optimized for skewed degree distributions - static coarse_level_triple build_skew(coarsen_handle& handle, - const matrix_t g, const matrix_t vcmap, - vtx_view_t mapped_edges, - vtx_view_t degree_initial) { + static coarse_level_triple build_skew(coarsen_handle& handle, const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, vtx_view_t degree_initial) { ordinal_t n = g.numRows(); ordinal_t nc = vcmap.numCols(); edge_offset_t gc_nedges = 0; @@ -1513,8 +1348,7 @@ class coarse_builder { // recount with edges only belonging to coarse vertex of smaller degree // matrix becomes directed Kokkos::parallel_for( - "recount edges", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "recount edges", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); @@ -1531,15 +1365,13 @@ class coarse_builder { } }, nonLoopEdgesTotal); - Kokkos::single(Kokkos::PerTeam(thread), [=]() { - Kokkos::atomic_add(&edges_per_source(u), nonLoopEdgesTotal); - }); + Kokkos::single(Kokkos::PerTeam(thread), + [=]() { Kokkos::atomic_add(&edges_per_source(u), nonLoopEdgesTotal); }); }); edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); - Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), - prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), prefix_sum(edges_per_source, source_bucket_offset)); edge_subview_t sbo_subview = Kokkos::subview(source_bucket_offset, nc); edge_offset_t nnz_pre_dedupe = 0; Kokkos::deep_copy(nnz_pre_dedupe, sbo_subview); @@ -1548,38 +1380,33 @@ class coarse_builder { vtx_view_t dest_by_source("dest by source", nnz_pre_dedupe); wgt_view_t wgt_by_source("wgt by source", nnz_pre_dedupe); Kokkos::parallel_for( - "combine fine rows", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "combine fine rows", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); edge_offset_t end = g.graph.row_map(outer_idx + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = mapped_edges(idx); - bool degree_less = degree_initial(u) < degree_initial(v); - bool degree_equal = degree_initial(u) == degree_initial(v); - if (degree_less || (degree_equal && u < v)) { - edge_offset_t offset = - Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); - offset += source_bucket_offset(u); + offset += source_bucket_offset(u); - dest_by_source(offset) = v; - wgt_by_source(offset) = g.values(idx); - } - }); + dest_by_source(offset) = v; + wgt_by_source(offset) = g.values(idx); + } + }); }); gc_nedges = 0; - deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, - wgt_by_source, source_bucket_offset, gc_nedges); + deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, wgt_by_source, source_bucket_offset, + gc_nedges); // form the final coarse graph, which requires symmetrizing the matrix - matrix_t gc = collapse_directed_to_undirected( - nc, edges_per_source, source_bucket_offset, dest_by_source, - wgt_by_source); + matrix_t gc = + collapse_directed_to_undirected(nc, edges_per_source, source_bucket_offset, dest_by_source, wgt_by_source); coarse_level_triple next_level; next_level.mtx = gc; @@ -1591,11 +1418,8 @@ class coarse_builder { // deduplicates within each fine row // combines fine rows into coarse rows // deduplicates within each coarse row - static coarse_level_triple build_high_duplicity(coarsen_handle& handle, - const matrix_t g, - const matrix_t vcmap, - vtx_view_t mapped_edges, - vtx_view_t degree_initial) { + static coarse_level_triple build_high_duplicity(coarsen_handle& handle, const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, vtx_view_t degree_initial) { ordinal_t n = g.numRows(); ordinal_t nc = vcmap.numCols(); edge_offset_t gc_nedges = 0; @@ -1606,8 +1430,7 @@ class coarse_builder { // recount fine row sizes with edges only belonging to fine vertex of coarse // vertex of smaller degree matrix becomes directed Kokkos::parallel_for( - "recount edges", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "recount edges", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); @@ -1624,13 +1447,10 @@ class coarse_builder { } }, nonLoopEdgesTotal); - Kokkos::single(Kokkos::PerTeam(thread), [=]() { - dedupe_count(outer_idx) = nonLoopEdgesTotal; - }); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { dedupe_count(outer_idx) = nonLoopEdgesTotal; }); }); - Kokkos::parallel_scan("calc source offsets", policy_t(0, n), - prefix_sum(dedupe_count, row_map_copy)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, n), prefix_sum(dedupe_count, row_map_copy)); // reset counters to 0 Kokkos::deep_copy(dedupe_count, static_cast(0)); @@ -1643,35 +1463,30 @@ class coarse_builder { // create a new directed version of the fine matrix Kokkos::parallel_for( - "move edges to new matrix", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "move edges to new matrix", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = g.graph.row_map(outer_idx); edge_offset_t end = g.graph.row_map(outer_idx + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = mapped_edges(idx); - bool degree_less = degree_initial(u) < degree_initial(v); - bool degree_equal = degree_initial(u) == degree_initial(v); - if (u != v && (degree_less || (degree_equal && u < v))) { - edge_offset_t offset = - Kokkos::atomic_fetch_add(&dedupe_count(outer_idx), 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + edge_offset_t offset = Kokkos::atomic_fetch_add(&dedupe_count(outer_idx), 1); - offset += row_map_copy(outer_idx); + offset += row_map_copy(outer_idx); - dest_fine(offset) = v; - wgt_fine(offset) = g.values(idx); - } - }); + dest_fine(offset) = v; + wgt_fine(offset) = g.values(idx); + } + }); }); //"delete" these views Kokkos::resize(mapped_edges, 0); // deduplicate coarse adjacencies within each fine row - deduplicate_graph(handle, n, true, dedupe_count, dest_fine, wgt_fine, - row_map_copy, gc_nedges); + deduplicate_graph(handle, n, true, dedupe_count, dest_fine, wgt_fine, row_map_copy, gc_nedges); edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); vtx_view_t edges_per_source("edges_per_source", nc); @@ -1681,46 +1496,40 @@ class coarse_builder { ordinal_t u = vcmap.graph.entries(i); Kokkos::atomic_fetch_add(&edges_per_source(u), dedupe_count(i)); }); - Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), - prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), prefix_sum(edges_per_source, source_bucket_offset)); Kokkos::deep_copy(edges_per_source, static_cast(0)); vtx_view_t dest_by_source("dest by source", gc_nedges); wgt_view_t wgt_by_source("wgt by source", gc_nedges); Kokkos::parallel_for( - "combine deduped fine rows", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "combine deduped fine rows", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t outer_idx = thread.league_rank(); ordinal_t u = vcmap.graph.entries(outer_idx); edge_offset_t start = row_map_copy(outer_idx); edge_offset_t end = start + dedupe_count(outer_idx); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(thread, start, end), - [=](const edge_offset_t idx) { - ordinal_t v = dest_fine(idx); - bool degree_less = degree_initial(u) < degree_initial(v); - bool degree_equal = degree_initial(u) == degree_initial(v); - if (degree_less || (degree_equal && u < v)) { - edge_offset_t offset = - Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), [=](const edge_offset_t idx) { + ordinal_t v = dest_fine(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = Kokkos::atomic_fetch_add(&edges_per_source(u), 1); - offset += source_bucket_offset(u); + offset += source_bucket_offset(u); - dest_by_source(offset) = v; - wgt_by_source(offset) = wgt_fine(idx); - } - }); + dest_by_source(offset) = v; + wgt_by_source(offset) = wgt_fine(idx); + } + }); }); gc_nedges = 0; Kokkos::resize(dest_fine, 0); Kokkos::resize(wgt_fine, 0); - deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, - wgt_by_source, source_bucket_offset, gc_nedges); + deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, wgt_by_source, source_bucket_offset, + gc_nedges); // form the final coarse graph, which requires symmetrizing the matrix - matrix_t gc = collapse_directed_to_undirected( - nc, edges_per_source, source_bucket_offset, dest_by_source, - wgt_by_source); + matrix_t gc = + collapse_directed_to_undirected(nc, edges_per_source, source_bucket_offset, dest_by_source, wgt_by_source); coarse_level_triple next_level; next_level.mtx = gc; @@ -1735,9 +1544,8 @@ class coarse_builder { vtx_view_t c_vtx_w, f_vtx_w; ordinal_t workLength; - countingFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, - vtx_view_t _degree_initial, vtx_view_t _c_vtx_w, - vtx_view_t _f_vtx_w) + countingFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, vtx_view_t _degree_initial, + vtx_view_t _c_vtx_w, vtx_view_t _f_vtx_w) : vcmap(_vcmap), g(_g), mapped_edges(_mapped_edges), @@ -1788,8 +1596,7 @@ class coarse_builder { } }; - static coarse_level_triple build_coarse_graph(coarsen_handle& handle, - const coarse_level_triple level, + static coarse_level_triple build_coarse_graph(coarsen_handle& handle, const coarse_level_triple level, const matrix_t vcmap) { if (handle.b == Spgemm || handle.b == Spgemm_transpose_first) { return build_coarse_graph_spgemm(handle, level, vcmap); @@ -1807,24 +1614,18 @@ class coarse_builder { // count non-self loop edges per coarse vertex // also computes coarse vertex weights - countingFunctor countF(vcmap, g, mapped_edges, degree_initial, c_vtx_w, - f_vtx_w); + countingFunctor countF(vcmap, g, mapped_edges, degree_initial, c_vtx_w, f_vtx_w); if (is_host_space) { - Kokkos::parallel_for( - "count edges per coarse vertex (also compute coarse vertex weights)", - policy_t(0, n), countF); + Kokkos::parallel_for("count edges per coarse vertex (also compute coarse vertex weights)", policy_t(0, n), + countF); } else { - auto execSpaceEnum = - KokkosKernels::Impl::kk_get_exec_space_type(); - int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( - n, g.nnz(), execSpaceEnum); + auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(n, g.nnz(), execSpaceEnum); team_policy_t dummy(1, 1, vectorLength); int teamSize = dummy.team_size_max(countF, Kokkos::ParallelForTag()); // count edges per vertex - Kokkos::parallel_for( - "count edges per coarse vertex (also compute coarse vertex weights)", - team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), - countF); + Kokkos::parallel_for("count edges per coarse vertex (also compute coarse vertex weights)", + team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), countF); } // compute max row size and avg row size @@ -1842,10 +1643,7 @@ class coarse_builder { Kokkos::Max(max_unduped)); Kokkos::parallel_reduce( "find total", policy_t(0, nc), - KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { - sum += degree_initial(i); - }, - total_unduped); + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { sum += degree_initial(i); }, total_unduped); ordinal_t avg_unduped = total_unduped / nc; coarse_level_triple next_level; @@ -1853,14 +1651,11 @@ class coarse_builder { // adjacency rows don't do optimizations if running on CPU (the default host // space) if (avg_unduped > (nc / 4) && !is_host_space) { - next_level = - build_high_duplicity(handle, g, vcmap, mapped_edges, degree_initial); - } else if (avg_unduped > 50 && (max_unduped / 10) > avg_unduped && - !is_host_space) { + next_level = build_high_duplicity(handle, g, vcmap, mapped_edges, degree_initial); + } else if (avg_unduped > 50 && (max_unduped / 10) > avg_unduped && !is_host_space) { next_level = build_skew(handle, g, vcmap, mapped_edges, degree_initial); } else { - next_level = - build_nonskew(handle, g, vcmap, mapped_edges, degree_initial); + next_level = build_nonskew(handle, g, vcmap, mapped_edges, degree_initial); } next_level.vtx_wgts = c_vtx_w; @@ -1870,9 +1665,7 @@ class coarse_builder { return next_level; } - static matrix_t generate_coarse_mapping(coarsen_handle& handle, - const matrix_t g, - bool uniform_weights) { + static matrix_t generate_coarse_mapping(coarsen_handle& handle, const matrix_t g, bool uniform_weights) { matrix_t interpolation_graph; int choice = 0; @@ -1883,14 +1676,9 @@ class coarse_builder { } switch (handle.h) { - case HECv1: - interpolation_graph = mapper_t::coarsen_HEC(g, uniform_weights); - break; + case HECv1: interpolation_graph = mapper_t::coarsen_HEC(g, uniform_weights); break; case Match: - case MtMetis: - interpolation_graph = - mapper_t::coarsen_match(g, uniform_weights, choice); - break; + case MtMetis: interpolation_graph = mapper_t::coarsen_match(g, uniform_weights, choice); break; case MIS2: interpolation_graph = mapper_t::coarsen_mis_2(g); break; case GOSHv2: interpolation_graph = mapper_t::coarsen_GOSH_v2(g); break; case GOSHv1: interpolation_graph = mapper_t::coarsen_GOSH(g); break; @@ -1902,9 +1690,7 @@ class coarse_builder { // this function can't return the generated list directly because of an NVCC // compiler bug caller must use the get_levels() method after calling this // function - static void generate_coarse_graphs(coarsen_handle& handle, - const matrix_t fine_g, - bool uniform_weights = false) { + static void generate_coarse_graphs(coarsen_handle& handle, const matrix_t fine_g, bool uniform_weights = false) { ordinal_t fine_n = fine_g.numRows(); std::list& levels = handle.results; levels.clear(); @@ -1920,15 +1706,13 @@ class coarse_builder { while (levels.rbegin()->mtx.numRows() > handle.coarse_vtx_cutoff) { coarse_level_triple current_level = *levels.rbegin(); - matrix_t interp_graph = generate_coarse_mapping( - handle, current_level.mtx, current_level.uniform_weights); + matrix_t interp_graph = generate_coarse_mapping(handle, current_level.mtx, current_level.uniform_weights); if (interp_graph.numCols() < handle.min_allowed_vtx) { break; } - coarse_level_triple next_level = - build_coarse_graph(handle, current_level, interp_graph); + coarse_level_triple next_level = build_coarse_graph(handle, current_level, interp_graph); levels.push_back(next_level); diff --git a/graph/src/KokkosGraph_CoarsenHeuristics.hpp b/graph/src/KokkosGraph_CoarsenHeuristics.hpp index 1694905167..f136882d89 100644 --- a/graph/src/KokkosGraph_CoarsenHeuristics.hpp +++ b/graph/src/KokkosGraph_CoarsenHeuristics.hpp @@ -74,8 +74,7 @@ class coarsen_heuristics { int t_buckets = 2 * n; vtx_view_t buckets("buckets", t_buckets); Kokkos::parallel_for( - "init buckets", policy_t(0, t_buckets), - KOKKOS_LAMBDA(ordinal_t i) { buckets(i) = ORD_MAX; }); + "init buckets", policy_t(0, t_buckets), KOKKOS_LAMBDA(ordinal_t i) { buckets(i) = ORD_MAX; }); uint64_t max = std::numeric_limits::max(); uint64_t bucket_size = max / t_buckets; @@ -87,8 +86,7 @@ class coarsen_heuristics { if (bucket >= t_buckets) bucket -= t_buckets; if (buckets(bucket) == ORD_MAX) { // attempt to insert into bucket - if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), - ORD_MAX, i)) { + if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), ORD_MAX, i)) { break; } } @@ -113,9 +111,9 @@ class coarsen_heuristics { // create a mapping when some vertices are already mapped // hn is a list of vertices such that vertex i wants to aggregate with vertex // hn(i) - static ordinal_t parallel_map_construct_prefilled( - vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, - const vtx_view_t hn, Kokkos::View nvertices_coarse) { + static ordinal_t parallel_map_construct_prefilled(vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, + const vtx_view_t hn, + Kokkos::View nvertices_coarse) { vtx_view_t match("match", n); Kokkos::parallel_for( policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { @@ -142,14 +140,11 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, - v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong( - &match(v), ORD_MAX, u)) { - ordinal_t cv = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(u) = cv; - vcmap(v) = cv; + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; } else { if (vcmap(v) != ORD_MAX) { vcmap(u) = vcmap(v); @@ -183,10 +178,8 @@ class coarsen_heuristics { // hn is a list of vertices such that vertex i wants to aggregate with vertex // hn(i) - static ordinal_t parallel_map_construct(vtx_view_t vcmap, const ordinal_t n, - const vtx_view_t vperm, - const vtx_view_t hn, - const vtx_view_t ordering) { + static ordinal_t parallel_map_construct(vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, + const vtx_view_t hn, const vtx_view_t ordering) { vtx_view_t match("match", n); Kokkos::parallel_for( policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { match(i) = ORD_MAX; }); @@ -208,10 +201,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, - v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong( - &match(v), ORD_MAX, u)) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { ordinal_t cv = u; if (v < u) { cv = v; @@ -232,9 +223,7 @@ class coarsen_heuristics { // add the ones that failed to be reprocessed next round // maybe count these then create next_perm to save memory? Kokkos::parallel_scan( - policy_t(0, perm_length), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + policy_t(0, perm_length), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = curr_perm(i); if (vcmap(u) == ORD_MAX) { if (final) { @@ -252,8 +241,7 @@ class coarsen_heuristics { curr_perm = next_perm; } Kokkos::parallel_scan( - "assign aggregates", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t u, ordinal_t& update, const bool final) { + "assign aggregates", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t u, ordinal_t& update, const bool final) { if (vcmap(u) == u) { if (final) { vcmap(u) = update; @@ -325,8 +313,7 @@ class coarsen_heuristics { edge_offset_t max_degree = tuple_degree(u); ordinal_t max_idx = tuple_idx(u); - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); bool is_max = false; if (tuple_state(v) > max_state) { @@ -375,8 +362,7 @@ class coarsen_heuristics { } // check if at least one of neighbors are in the IS or will be // placed into the IS - else if (tuple_state(u) == 1 || - tuple_idx(tuple_idx(u)) == tuple_idx(u)) { + else if (tuple_state(u) == 1 || tuple_idx(tuple_idx(u)) == tuple_idx(u)) { state(u) = -1; } } @@ -389,8 +375,7 @@ class coarsen_heuristics { vtx_view_t next_unassigned("next unassigned", next_unassigned_total); Kokkos::parallel_scan( "create next unassigned", policy_t(0, unassigned_total), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = unassigned(i); if (state(u) == 0) { if (final) { @@ -408,12 +393,11 @@ class coarsen_heuristics { static matrix_t coarsen_mis_2(const matrix_t& g) { ordinal_t n = g.numRows(); - typename matrix_t::staticcrsgraph_type::entries_type::non_const_value_type - nc = 0; - vtx_view_t vcmap = KokkosGraph::graph_mis2_aggregate< - Device, typename matrix_t::staticcrsgraph_type::row_map_type, - typename matrix_t::staticcrsgraph_type::entries_type, vtx_view_t>( - g.graph.row_map, g.graph.entries, nc); + typename matrix_t::staticcrsgraph_type::entries_type::non_const_value_type nc = 0; + vtx_view_t vcmap = + KokkosGraph::graph_mis2_aggregate( + g.graph.row_map, g.graph.entries, nc); edge_view_t row_map("interpolate row map", n + 1); @@ -461,11 +445,9 @@ class coarsen_heuristics { if (colors(i) != first_color) { // could use a thread team here edge_offset_t max_degree = 0; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { - ordinal_t v = g.graph.entries(j); - edge_offset_t degree = - g.graph.row_map(v + 1) - g.graph.row_map(v); + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = g.graph.row_map(v + 1) - g.graph.row_map(v); if (colors(v) == first_color && degree > max_degree) { max_degree = degree; vcmap(i) = vcmap(v); @@ -524,8 +506,7 @@ class coarsen_heuristics { if (vcmap(i) == ORD_MAX) { ordinal_t argmax = ORD_MAX; scalar_t max_w = 0; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { ordinal_t v = g.graph.entries(j); ordinal_t wgt = g.values(j); if (vcmap(v) != ORD_MAX) { @@ -547,11 +528,9 @@ class coarsen_heuristics { if (vcmap(i) == ORD_MAX) { ordinal_t argmax = ORD_MAX; edge_offset_t max_d = 0; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { - ordinal_t v = g.graph.entries(j); - edge_offset_t degree = - g.graph.row_map(v + 1) - g.graph.row_map(v); + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = g.graph.row_map(v + 1) - g.graph.row_map(v); if (vcmap(v) != ORD_MAX) { if (degree >= max_d) { max_d = degree; @@ -569,8 +548,7 @@ class coarsen_heuristics { Kokkos::parallel_for( policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { if (vcmap(i) != ORD_MAX) { - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { ordinal_t v = g.graph.entries(j); if (vcmap(v) == ORD_MAX) { vcmap(v) = vcmap(i); @@ -593,8 +571,7 @@ class coarsen_heuristics { vtx_view_t remaining("remaining vtx", remaining_total); Kokkos::parallel_scan( - "count remaining", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { + "count remaining", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { if (vcmap(i) == ORD_MAX) { if (final) { remaining(update) = i; @@ -608,8 +585,7 @@ class coarsen_heuristics { pool_t rand_pool(std::time(nullptr)); Kokkos::parallel_for( - "fill hn", policy_t(0, remaining_total), - KOKKOS_LAMBDA(ordinal_t r_idx) { + "fill hn", policy_t(0, remaining_total), KOKKOS_LAMBDA(ordinal_t r_idx) { // select heaviest neighbor with ties randomly broken ordinal_t i = remaining(r_idx); ordinal_t hn_i = ORD_MAX; @@ -639,8 +615,7 @@ class coarsen_heuristics { hn(i) = hn_i; }); - ordinal_t nc = - parallel_map_construct_prefilled(vcmap, n, remaining, hn, nvc); + ordinal_t nc = parallel_map_construct_prefilled(vcmap, n, remaining, hn, nvc); Kokkos::deep_copy(nc, nvc); edge_view_t row_map("interpolate row map", n + 1); @@ -671,8 +646,7 @@ class coarsen_heuristics { vtx_view_t vcmap("vcmap", n); Kokkos::parallel_for( - "initialize vcmap", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + "initialize vcmap", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); pool_t rand_pool(std::time(nullptr)); @@ -680,8 +654,7 @@ class coarsen_heuristics { vtx_view_t reverse_map("reversed", n); Kokkos::parallel_for( - "construct reverse map", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); + "construct reverse map", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); if (uniform_weights) { // all weights equal at this level so choose heaviest edge randomly @@ -690,9 +663,8 @@ class coarsen_heuristics { gen_t generator = rand_pool.get_state(); ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); if (adj_size > 0) { - ordinal_t offset = - g.graph.row_map(i) + (generator.urand64() % adj_size); - hn(i) = g.graph.entries(offset); + ordinal_t offset = g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); } else { hn(i) = generator.urand64() % n; } @@ -700,18 +672,15 @@ class coarsen_heuristics { }); } else { Kokkos::parallel_for( - "Heaviest HN", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "Heaviest HN", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t i = thread.league_rank(); ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); if (adj_size > 0) { edge_offset_t end = g.graph.row_map(i + 1); - typename Kokkos::MaxLoc::value_type argmax{}; + typename Kokkos::MaxLoc::value_type argmax{}; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(thread, g.graph.row_map(i), end), - [=](const edge_offset_t idx, - Kokkos::ValLocScalar& local) { + [=](const edge_offset_t idx, Kokkos::ValLocScalar& local) { scalar_t wgt = g.values(idx); if (wgt >= local.val) { local.val = wgt; @@ -773,10 +742,8 @@ class coarsen_heuristics { Kokkos::View hashes; ordinal_t unmapped_total; Kokkos::View nvertices_coarse; - MatchByHashSorted(vtx_view_t _vcmap, vtx_view_t _unmapped, - Kokkos::View _hashes, - ordinal_t _unmapped_total, - Kokkos::View _nvertices_coarse) + MatchByHashSorted(vtx_view_t _vcmap, vtx_view_t _unmapped, Kokkos::View _hashes, + ordinal_t _unmapped_total, Kokkos::View _nvertices_coarse) : vcmap(_vcmap), unmapped(_unmapped), hashes(_hashes), @@ -784,8 +751,7 @@ class coarsen_heuristics { nvertices_coarse(_nvertices_coarse) {} KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_t i, ordinal_t& update, - const bool final) const { + void operator()(const ordinal_t i, ordinal_t& update, const bool final) const { ordinal_t u = unmapped(i); ordinal_t tentative = 0; if (i == 0) { @@ -823,8 +789,7 @@ class coarsen_heuristics { } }; - static matrix_t coarsen_match(const matrix_t& g, bool uniform_weights, - int match_choice) { + static matrix_t coarsen_match(const matrix_t& g, bool uniform_weights, int match_choice) { ordinal_t n = g.numRows(); vtx_view_t hn("heavies", n); @@ -832,8 +797,7 @@ class coarsen_heuristics { vtx_view_t vcmap("vcmap", n); Kokkos::parallel_for( - "initialize vcmap", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + "initialize vcmap", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); rand_view_t randoms("randoms", n); @@ -843,8 +807,7 @@ class coarsen_heuristics { vtx_view_t reverse_map("reversed", n); Kokkos::parallel_for( - "construct reverse map", policy_t(0, n), - KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); + "construct reverse map", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); if (uniform_weights) { // all weights equal at this level so choose heaviest edge randomly @@ -852,9 +815,8 @@ class coarsen_heuristics { "Random HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { gen_t generator = rand_pool.get_state(); ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); - ordinal_t offset = - g.graph.row_map(i) + (generator.urand64() % adj_size); - hn(i) = g.graph.entries(offset); + ordinal_t offset = g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); rand_pool.free_state(generator); }); } else { @@ -863,11 +825,9 @@ class coarsen_heuristics { ordinal_t hn_i = g.graph.entries(g.graph.row_map(i)); scalar_t max_ewt = g.values(g.graph.row_map(i)); - edge_offset_t end_offset = - g.graph.row_map(i + 1); // +g.edges_per_source[i]; + edge_offset_t end_offset = g.graph.row_map(i + 1); // +g.edges_per_source[i]; - for (edge_offset_t j = g.graph.row_map(i) + 1; j < end_offset; - j++) { + for (edge_offset_t j = g.graph.row_map(i) + 1; j < end_offset; j++) { if (max_ewt < g.values(j)) { max_ewt = g.values(j); hn_i = g.graph.entries(j); @@ -899,15 +859,12 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, - v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong( - &match(v), ORD_MAX, u)) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { // u == v avoids problems if there is a self-loop edge - ordinal_t cv = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(u) = cv; - vcmap(v) = cv; + ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; } else { match(u) = ORD_MAX; } @@ -930,8 +887,7 @@ class coarsen_heuristics { // check if any are unmatched! so instead of randomly choosing a // heaviest edge, we instead use the reverse permutation order // as the weight - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); // v must be unmatched to be considered if (vcmap(v) == ORD_MAX) { @@ -944,8 +900,7 @@ class coarsen_heuristics { } } else { scalar_t max_ewt = 0; - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); // v must be unmatched to be considered if (vcmap(v) == ORD_MAX) { @@ -959,8 +914,7 @@ class coarsen_heuristics { } if (h != ORD_MAX) { - ordinal_t add_next = - Kokkos::atomic_fetch_add(&next_length(), 1); + ordinal_t add_next = Kokkos::atomic_fetch_add(&next_length(), 1); next_perm(add_next) = u; hn(u) = h; } @@ -973,9 +927,8 @@ class coarsen_heuristics { } if (match_choice == 1) { - ordinal_t unmapped = countInf(vcmap); - double unmappedRatio = - static_cast(unmapped) / static_cast(n); + ordinal_t unmapped = countInf(vcmap); + double unmappedRatio = static_cast(unmapped) / static_cast(n); // leaf matches if (unmappedRatio > 0.25) { @@ -983,8 +936,7 @@ class coarsen_heuristics { policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { if (vcmap(u) != ORD_MAX) { ordinal_t lastLeaf = ORD_MAX; - for (edge_offset_t j = g.graph.row_map(u); - j < g.graph.row_map(u + 1); j++) { + for (edge_offset_t j = g.graph.row_map(u); j < g.graph.row_map(u + 1); j++) { ordinal_t v = g.graph.entries(j); // v must be unmatched to be considered if (vcmap(v) == ORD_MAX) { @@ -993,10 +945,9 @@ class coarsen_heuristics { if (lastLeaf == ORD_MAX) { lastLeaf = v; } else { - vcmap(lastLeaf) = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(v) = vcmap(lastLeaf); - lastLeaf = ORD_MAX; + vcmap(lastLeaf) = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(v) = vcmap(lastLeaf); + lastLeaf = ORD_MAX; } } } @@ -1017,20 +968,16 @@ class coarsen_heuristics { hasher_t hasher; // compute digests of adjacency lists Kokkos::parallel_for( - "create digests", team_policy_t(n, Kokkos::AUTO), - KOKKOS_LAMBDA(const member& thread) { + "create digests", team_policy_t(n, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { ordinal_t u = thread.league_rank(); if (vcmap(u) == ORD_MAX) { uint32_t hash = 0; Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(thread, g.graph.row_map(u), - g.graph.row_map(u + 1)), - [=](const edge_offset_t j, uint32_t& thread_sum) { - thread_sum += hasher(g.graph.entries(j)); - }, + Kokkos::TeamThreadRange(thread, g.graph.row_map(u), g.graph.row_map(u + 1)), + [=](const edge_offset_t j, uint32_t& thread_sum) { thread_sum += hasher(g.graph.entries(j)); }, hash); Kokkos::single(Kokkos::PerTeam(thread), [=]() { - ordinal_t idx = Kokkos::atomic_fetch_add(&unmappedIdx(), 1); + ordinal_t idx = Kokkos::atomic_fetch_add(&unmappedIdx(), 1); unmappedVtx(idx) = u; hashes(idx) = hash; }); @@ -1040,17 +987,13 @@ class coarsen_heuristics { typedef Kokkos::BinOp1D > BinOp; BinOp bin_op(unmapped, 0, max); // VERY important that final parameter is true - Kokkos::BinSort, BinOp, exec_space, - ordinal_t> - sorter(hashes, bin_op, true); + Kokkos::BinSort, BinOp, exec_space, ordinal_t> sorter(hashes, bin_op, true); sorter.create_permute_vector(); sorter.template sort >(hashes); sorter.template sort(unmappedVtx); - MatchByHashSorted matchTwinFunctor(vcmap, unmappedVtx, hashes, unmapped, - nvertices_coarse); - Kokkos::parallel_scan("match twins", policy_t(0, unmapped), - matchTwinFunctor); + MatchByHashSorted matchTwinFunctor(vcmap, unmappedVtx, hashes, unmapped, nvertices_coarse); + Kokkos::parallel_scan("match twins", policy_t(0, unmapped), matchTwinFunctor); } unmapped = countInf(vcmap); @@ -1061,9 +1004,7 @@ class coarsen_heuristics { // get possibly mappable vertices of unmapped vtx_view_t mappableVtx("mappable vertices", unmapped); Kokkos::parallel_scan( - "get unmapped", policy_t(0, n), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + "get unmapped", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { if (vcmap(i) == ORD_MAX) { if (final) { mappableVtx(update) = i; @@ -1076,8 +1017,7 @@ class coarsen_heuristics { ordinal_t mappable_count = unmapped; do { Kokkos::parallel_for( - "reset hn", policy_t(0, mappable_count), - KOKKOS_LAMBDA(ordinal_t i) { + "reset hn", policy_t(0, mappable_count), KOKKOS_LAMBDA(ordinal_t i) { ordinal_t u = mappableVtx(i); hn(u) = ORD_MAX; }); @@ -1087,8 +1027,7 @@ class coarsen_heuristics { "assign relatives", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { if (vcmap(i) != ORD_MAX) { ordinal_t last_free = ORD_MAX; - for (edge_offset_t j = g.graph.row_map(i); - j < g.graph.row_map(i + 1); j++) { + for (edge_offset_t j = g.graph.row_map(i); j < g.graph.row_map(i + 1); j++) { ordinal_t v = g.graph.entries(j); if (vcmap(v) == ORD_MAX) { if (last_free != ORD_MAX) { @@ -1123,8 +1062,7 @@ class coarsen_heuristics { Kokkos::parallel_scan( "get next mappable", policy_t(0, old_mappable), - KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, - const bool final) { + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { ordinal_t u = mappableVtx(i); if (hn(u) != ORD_MAX) { if (final) { @@ -1146,14 +1084,11 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), - ORD_MAX, v)) { - if (Kokkos::atomic_compare_exchange_strong(&match(v), - ORD_MAX, u)) { - ordinal_t cv = - Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); - vcmap(u) = cv; - vcmap(v) = cv; + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { + if (Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; } else { match(u) = ORD_MAX; } diff --git a/graph/src/KokkosGraph_Distance1Color.hpp b/graph/src/KokkosGraph_Distance1Color.hpp index 784b687957..86bb28bab0 100644 --- a/graph/src/KokkosGraph_Distance1Color.hpp +++ b/graph/src/KokkosGraph_Distance1Color.hpp @@ -24,13 +24,10 @@ namespace KokkosGraph { namespace Experimental { -template -void graph_color_symbolic(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t /* num_cols */, - lno_row_view_t_ row_map, lno_nnz_view_t_ entries, - bool /* is_symmetric */ = true) { +template +void graph_color_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t /* num_cols */, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, bool /* is_symmetric */ = true) { typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; @@ -40,37 +37,29 @@ void graph_color_symbolic(KernelHandle *handle, typedef typename KernelHandle::const_nnz_lno_t c_lno_t; typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; - typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< - c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle ConstKernelHandle; ConstKernelHandle tmp_handle(*handle); typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, DeviceType, + Kokkos::MemoryTraits > Internal_rowmap; typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, DeviceType, + Kokkos::MemoryTraits > Internal_entries; - KokkosGraph::Impl:: - COLOR_D1::color_d1( - &tmp_handle, num_rows, - Internal_rowmap(row_map.data(), row_map.extent(0)), - Internal_entries(entries.data(), entries.extent(0))); + KokkosGraph::Impl::COLOR_D1::color_d1( + &tmp_handle, num_rows, Internal_rowmap(row_map.data(), row_map.extent(0)), + Internal_entries(entries.data(), entries.extent(0))); } -template -void graph_color(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_cols, - lno_row_view_t_ row_map, lno_nnz_view_t_ entries, +template +void graph_color(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_symmetric = true) { - graph_color_symbolic(handle, num_rows, num_cols, row_map, entries, - is_symmetric); + graph_color_symbolic(handle, num_rows, num_cols, row_map, entries, is_symmetric); } } // end namespace Experimental diff --git a/graph/src/KokkosGraph_Distance1ColorHandle.hpp b/graph/src/KokkosGraph_Distance1ColorHandle.hpp index 1b2f981945..1eefd07c4d 100644 --- a/graph/src/KokkosGraph_Distance1ColorHandle.hpp +++ b/graph/src/KokkosGraph_Distance1ColorHandle.hpp @@ -22,7 +22,7 @@ #ifndef _GRAPHCOLORHANDLE_HPP #define _GRAPHCOLORHANDLE_HPP -//#define VERBOSE +// #define VERBOSE namespace KokkosGraph { enum ColoringAlgorithm { @@ -45,8 +45,7 @@ enum ColoringType { Distance1, Distance2 }; template + class ExecutionSpace, class TemporaryMemorySpace, class PersistentMemorySpace> class GraphColoringHandle { public: typedef ExecutionSpace HandleExecSpace; @@ -62,8 +61,7 @@ class GraphColoringHandle { typedef typename std::remove_const::type color_t; typedef const color_t const_color_t; - typedef typename Kokkos::View - color_view_t; + typedef typename Kokkos::View color_view_t; typedef typename color_view_t::array_layout color_view_array_layout; typedef typename color_view_t::device_type color_view_device_t; @@ -71,20 +69,15 @@ class GraphColoringHandle { typedef typename color_view_t::HostMirror color_host_view_t; // Host view // type - typedef typename Kokkos::View - size_type_temp_work_view_t; - typedef typename Kokkos::View - size_type_persistent_work_view_t; + typedef typename Kokkos::View size_type_temp_work_view_t; + typedef typename Kokkos::View size_type_persistent_work_view_t; - typedef typename size_type_persistent_work_view_t::HostMirror - size_type_persistent_work_host_view_t; // Host view type + typedef + typename size_type_persistent_work_view_t::HostMirror size_type_persistent_work_host_view_t; // Host view type - typedef typename Kokkos::View - nnz_lno_temp_work_view_t; - typedef typename Kokkos::View - nnz_lno_persistent_work_view_t; - typedef typename nnz_lno_persistent_work_view_t::HostMirror - nnz_lno_persistent_work_host_view_t; // Host view type + typedef typename Kokkos::View nnz_lno_temp_work_view_t; + typedef typename Kokkos::View nnz_lno_persistent_work_view_t; + typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; // Host view type typedef Kokkos::TeamPolicy team_policy_t; typedef typename team_policy_t::member_type team_member_t; @@ -95,9 +88,9 @@ class GraphColoringHandle { ColoringType GraphColoringType; // Parameters ColoringAlgorithm coloring_algorithm_type; // VB, VBBIT, VBCS, VBD or EB. - ConflictList conflict_list_type; // whether to use a conflict list or not, - // and if using it wheter to create it with - // atomic or parallel prefix sum. + ConflictList conflict_list_type; // whether to use a conflict list or not, + // and if using it wheter to create it with + // atomic or parallel prefix sum. double min_reduction_for_conflictlist; // if used pps is selected to create conflict list, what min percantage should @@ -116,23 +109,23 @@ class GraphColoringHandle { bool vb_edge_filtering; // whether to do edge filtering or not in vertex // based algorithms. Swaps on the ad error. - int vb_chunk_size; // the (minimum) size of the consecutive works that a - // thread will be assigned to. + int vb_chunk_size; // the (minimum) size of the consecutive works that a + // thread will be assigned to. int max_number_of_iterations; // maximum allowed number of phases int eb_num_initial_colors; // the number of colors to assign at the beginning // of the edge-based algorithm // STATISTICS - double overall_coloring_time; // the overall time that it took to color the - // graph. In the case of the iterative calls. + double overall_coloring_time; // the overall time that it took to color the + // graph. In the case of the iterative calls. double overall_coloring_time_phase1; // double overall_coloring_time_phase2; // double overall_coloring_time_phase3; // Some timer accumulators for internal // phases. double overall_coloring_time_phase4; // double overall_coloring_time_phase5; // - double coloring_time; // the time that it took to color the graph + double coloring_time; // the time that it took to color the graph int num_phases; // @@ -189,9 +182,7 @@ class GraphColoringHandle { * KokkosKernels::Experimental::Graph::Distance1 or * KokkosKernels::Experimental::Graph::Distance2 */ - void set_coloring_type(const ColoringType &col_type) { - this->GraphColoringType = col_type; - } + void set_coloring_type(const ColoringType &col_type) { this->GraphColoringType = col_type; } /** \brief Gets the graph coloring type. Whether it is distance-1 or * distance-2 coloring. returns Coloring Type: @@ -206,8 +197,7 @@ class GraphColoringHandle { * COLORING_VBCS, COLORING_EB \param set_default_parameters: whether or not to * reset the default parameters for the given algorithm. */ - void set_algorithm(const ColoringAlgorithm &col_algo, - bool set_default_parameters = true) { + void set_algorithm(const ColoringAlgorithm &col_algo, bool set_default_parameters = true) { if (col_algo == COLORING_DEFAULT) { this->choose_default_algorithm(); } else { @@ -228,27 +218,23 @@ class GraphColoringHandle { if (exec == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_SERIAL; #ifdef VERBOSE - std::cout - << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; + std::cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; #endif } else if (exec == KokkosKernels::Impl::Exec_SYCL) { // FIXME SYCL: Do not use EB this->coloring_algorithm_type = COLORING_VBBIT; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; #endif } else if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { this->coloring_algorithm_type = COLORING_EB; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_EB\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } else { this->coloring_algorithm_type = COLORING_VBBIT; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; #endif } } @@ -261,10 +247,7 @@ class GraphColoringHandle { v3 lower_xadj_counts; CountLowerTriangle(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_) - : nv(nv_), - xadj(xadj_), - adj(adj_), - lower_xadj_counts(lower_xadj_counts_) {} + : nv(nv_), xadj(xadj_), adj(adj_), lower_xadj_counts(lower_xadj_counts_) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t &i, size_type &new_num_edge) const { @@ -290,18 +273,12 @@ class GraphColoringHandle { v2 adj; v3 lower_xadj_counts; - CountLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, - v3 lower_xadj_counts_) - : nv(nv_), - xadj(xadj_), - adj(adj_), - lower_xadj_counts(lower_xadj_counts_) {} + CountLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_) + : nv(nv_), xadj(xadj_), adj(adj_), lower_xadj_counts(lower_xadj_counts_) {} KOKKOS_INLINE_FUNCTION - void operator()( - const team_member_t &teamMember /*, row_lno_t &new_num_edge*/) const { - nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + void operator()(const team_member_t &teamMember /*, row_lno_t &new_num_edge*/) const { + nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= nv) { return; } @@ -322,8 +299,7 @@ class GraphColoringHandle { }, new_edge_count); - Kokkos::single(Kokkos::PerThread(teamMember), - [&]() { lower_xadj_counts(ii + 1) = new_edge_count; }); + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { lower_xadj_counts(ii + 1) = new_edge_count; }); } }; @@ -336,8 +312,7 @@ class GraphColoringHandle { v4 lower_srcs; v4 lower_dsts; - FillLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, - v3 lower_xadj_counts_, v4 lower_srcs_, v4 lower_dsts_) + FillLowerTriangleTeam(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_, v4 lower_srcs_, v4 lower_dsts_) : nv(nv_), xadj(xadj_), adj(adj_), @@ -347,12 +322,9 @@ class GraphColoringHandle { KOKKOS_INLINE_FUNCTION void operator()(const team_member_t &teamMember) const { - typedef - typename std::remove_reference::type - atomic_incr_type; + typedef typename std::remove_reference::type atomic_incr_type; - nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + - teamMember.team_rank(); + nnz_lno_t ii = teamMember.league_rank() * teamMember.team_size() + teamMember.team_rank(); if (ii >= nv) { return; } @@ -360,18 +332,15 @@ class GraphColoringHandle { size_type xadj_begin = xadj(ii); size_type xadj_end = xadj(ii + 1); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, xadj_end - xadj_begin), - [&](size_type i) { - size_type adjind = i + xadj_begin; - nnz_lno_t n = adj[adjind]; - if (ii < n && n < nv) { - size_type position = Kokkos::atomic_fetch_add( - &(lower_xadj_counts(ii)), atomic_incr_type(1)); - lower_srcs(position) = ii; - lower_dsts(position) = n; - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember, xadj_end - xadj_begin), [&](size_type i) { + size_type adjind = i + xadj_begin; + nnz_lno_t n = adj[adjind]; + if (ii < n && n < nv) { + size_type position = Kokkos::atomic_fetch_add(&(lower_xadj_counts(ii)), atomic_incr_type(1)); + lower_srcs(position) = ii; + lower_dsts(position) = n; + } + }); } }; @@ -384,8 +353,7 @@ class GraphColoringHandle { v4 lower_srcs; v4 lower_dsts; - FillLowerTriangle(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_, - v4 lower_srcs_, v4 lower_dsts_) + FillLowerTriangle(nnz_lno_t nv_, v1 xadj_, v2 adj_, v3 lower_xadj_counts_, v4 lower_srcs_, v4 lower_dsts_) : nv(nv_), xadj(xadj_), adj(adj_), @@ -410,21 +378,18 @@ class GraphColoringHandle { }; template - void symmetrize_and_calculate_lower_diagonal_edge_list( - nnz_lno_t nv, row_index_view_type xadj, nonzero_view_type adj) { - KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list< - row_index_view_type, nonzero_view_type, nnz_lno_persistent_work_view_t, - ExecutionSpace>(nv, xadj, adj, lower_triangle_src, lower_triangle_dst); + void symmetrize_and_calculate_lower_diagonal_edge_list(nnz_lno_t nv, row_index_view_type xadj, + nonzero_view_type adj) { + KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list( + nv, xadj, adj, lower_triangle_src, lower_triangle_dst); size_of_edge_list = lower_triangle_src.extent(0); } template - void get_lower_diagonal_edge_list(nnz_lno_t nv, size_type ne, - row_index_view_type xadj, - nonzero_view_type adj, - size_type &num_out_edges, - nnz_lno_persistent_work_view_t &src, + void get_lower_diagonal_edge_list(nnz_lno_t nv, size_type ne, row_index_view_type xadj, nonzero_view_type adj, + size_type &num_out_edges, nnz_lno_persistent_work_view_t &src, nnz_lno_persistent_work_view_t &dst) { if (size_of_edge_list > 0) { num_out_edges = size_of_edge_list; @@ -441,26 +406,20 @@ class GraphColoringHandle { int teamSizeMax = 0; int vector_size = 0; - CountLowerTriangleTeam - clt(nv, xadj, adj, lower_count); + CountLowerTriangleTeam clt(nv, xadj, adj, + lower_count); - KokkosKernels::Impl::get_suggested_vector_size( - vector_size, nv, ne); + KokkosKernels::Impl::get_suggested_vector_size(vector_size, nv, ne); - teamSizeMax = - KokkosKernels::Impl::get_suggested_team_size( - clt, vector_size); + teamSizeMax = KokkosKernels::Impl::get_suggested_team_size(clt, vector_size); Kokkos::parallel_for("KokkosGraph::CountLowerTriangleTeam", - team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, - teamSizeMax, vector_size), + team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size), clt //, new_num_edge ); - KokkosKernels::Impl::inclusive_parallel_prefix_sum< - size_type_temp_work_view_t, ExecutionSpace>(nv + 1, lower_count); + KokkosKernels::Impl::inclusive_parallel_prefix_sum(nv + 1, + lower_count); // Kokkos::parallel_scan (my_exec_space(0, nv + 1), // PPS(lower_count)); ExecutionSpace().fence(); @@ -469,20 +428,15 @@ class GraphColoringHandle { Kokkos::deep_copy(hlower, lower_total_count); new_num_edge = hlower(); - nnz_lno_persistent_work_view_t half_src( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), - new_num_edge); - nnz_lno_persistent_work_view_t half_dst( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), - new_num_edge); + nnz_lno_persistent_work_view_t half_src(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), + new_num_edge); + nnz_lno_persistent_work_view_t half_dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), + new_num_edge); Kokkos::parallel_for( "KokkosGraph::FillLowerTriangleTeam", - team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, - vector_size), - FillLowerTriangleTeam( - nv, xadj, adj, lower_count, half_src, half_dst)); + team_policy_t((nv + teamSizeMax - 1) / teamSizeMax, teamSizeMax, vector_size), + FillLowerTriangleTeam(nv, xadj, adj, lower_count, half_src, half_dst)); src = lower_triangle_src = half_src; dst = lower_triangle_dst = half_dst; @@ -491,30 +445,25 @@ class GraphColoringHandle { if (nv > 0) { Kokkos::parallel_reduce( "KokkosGraph::CountLowerTriangleTeam", my_exec_space(0, nv), - CountLowerTriangle(nv, xadj, adj, - lower_count), + CountLowerTriangle(nv, xadj, adj, + lower_count), new_num_edge); } // Kokkos::parallel_scan (my_exec_space(0, nv + 1), // PPS(lower_count)); - KokkosKernels::Impl::inclusive_parallel_prefix_sum< - size_type_temp_work_view_t, ExecutionSpace>(nv + 1, lower_count); - nnz_lno_persistent_work_view_t half_src( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), - new_num_edge); - nnz_lno_persistent_work_view_t half_dst( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), - new_num_edge); + KokkosKernels::Impl::inclusive_parallel_prefix_sum(nv + 1, + lower_count); + nnz_lno_persistent_work_view_t half_src(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF SRC"), + new_num_edge); + nnz_lno_persistent_work_view_t half_dst(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HALF DST"), + new_num_edge); Kokkos::parallel_for( "KokkosGraph::FillLowerTriangleTeam", my_exec_space(0, nv), - FillLowerTriangle( - nv, xadj, adj, lower_count, half_src, half_dst)); + FillLowerTriangle(nv, xadj, adj, lower_count, half_src, half_dst)); src = lower_triangle_src = half_src; dst = lower_triangle_dst = half_dst; @@ -547,8 +496,7 @@ class GraphColoringHandle { nnz_lno_t get_num_colors() { if (num_colors == 0) { typedef typename Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce("KokkosKernels::FindMax", - my_exec_space(0, vertex_colors.extent(0)), + Kokkos::parallel_reduce("KokkosKernels::FindMax", my_exec_space(0, vertex_colors.extent(0)), ReduceMaxFunctor(vertex_colors), num_colors); } return num_colors; @@ -594,47 +542,23 @@ class GraphColoringHandle { virtual ~GraphColoringHandle(){}; // getters - ColoringAlgorithm get_coloring_algo_type() const { - return this->coloring_algorithm_type; - } - ConflictList get_conflict_list_type() const { - return this->conflict_list_type; - } - double get_min_reduction_for_conflictlist() const { - return this->min_reduction_for_conflictlist; - } - int get_min_elements_for_conflictlist() const { - return this->min_elements_for_conflictlist; - } - bool get_serial_conflict_resolution() const { - return this->serial_conflict_resolution; - } + ColoringAlgorithm get_coloring_algo_type() const { return this->coloring_algorithm_type; } + ConflictList get_conflict_list_type() const { return this->conflict_list_type; } + double get_min_reduction_for_conflictlist() const { return this->min_reduction_for_conflictlist; } + int get_min_elements_for_conflictlist() const { return this->min_elements_for_conflictlist; } + bool get_serial_conflict_resolution() const { return this->serial_conflict_resolution; } bool get_tictoc() const { return this->tictoc; } bool get_vb_edge_filtering() const { return this->vb_edge_filtering; } int get_vb_chunk_size() const { return this->vb_chunk_size; } - int get_max_number_of_iterations() const { - return this->max_number_of_iterations; - } + int get_max_number_of_iterations() const { return this->max_number_of_iterations; } int get_eb_num_initial_colors() const { return this->eb_num_initial_colors; } - double get_overall_coloring_time() const { - return this->overall_coloring_time; - } - double get_overall_coloring_time_phase1() const { - return this->overall_coloring_time_phase1; - } - double get_overall_coloring_time_phase2() const { - return this->overall_coloring_time_phase2; - } - double get_overall_coloring_time_phase3() const { - return this->overall_coloring_time_phase3; - } - double get_overall_coloring_time_phase4() const { - return this->overall_coloring_time_phase4; - } - double get_overall_coloring_time_phase5() const { - return this->overall_coloring_time_phase5; - } + double get_overall_coloring_time() const { return this->overall_coloring_time; } + double get_overall_coloring_time_phase1() const { return this->overall_coloring_time_phase1; } + double get_overall_coloring_time_phase2() const { return this->overall_coloring_time_phase2; } + double get_overall_coloring_time_phase3() const { return this->overall_coloring_time_phase3; } + double get_overall_coloring_time_phase4() const { return this->overall_coloring_time_phase4; } + double get_overall_coloring_time_phase5() const { return this->overall_coloring_time_phase5; } double get_coloring_time() const { return this->coloring_time; } int get_num_phases() const { return this->num_phases; } color_view_t get_vertex_colors() const { return this->vertex_colors; } @@ -643,44 +567,28 @@ class GraphColoringHandle { nnz_lno_temp_work_view_t get_vertex_list() const { return this->vertex_list; } size_type get_vertex_list_size() const { return this->vertex_list_size; } // setters - void set_vertex_list(nnz_lno_temp_work_view_t vertex_list_, - size_type vertex_list_size_) { + void set_vertex_list(nnz_lno_temp_work_view_t vertex_list_, size_type vertex_list_size_) { this->vertex_list = vertex_list_; this->vertex_list_size = vertex_list_size_; this->use_vtx_list = true; } - void set_coloring_algo_type(const ColoringAlgorithm &col_algo) { - this->coloring_algorithm_type = col_algo; - } - void set_conflict_list_type(const ConflictList &cl) { - this->conflict_list_type = cl; - } + void set_coloring_algo_type(const ColoringAlgorithm &col_algo) { this->coloring_algorithm_type = col_algo; } + void set_conflict_list_type(const ConflictList &cl) { this->conflict_list_type = cl; } void set_min_reduction_for_conflictlist(const double &min_reduction) { this->min_reduction_for_conflictlist = min_reduction; } void set_min_elements_for_conflictlist(const int &min_elements) { this->min_elements_for_conflictlist = min_elements; } - void set_serial_conflict_resolution( - const bool &use_serial_conflist_resolution) { + void set_serial_conflict_resolution(const bool &use_serial_conflist_resolution) { this->serial_conflict_resolution = use_serial_conflist_resolution; } void set_tictoc(const bool use_tictoc) { this->tictoc = use_tictoc; } - void set_vb_edge_filtering(const bool &use_vb_edge_filtering) { - this->vb_edge_filtering = use_vb_edge_filtering; - } - void set_vb_chunk_size(const int &chunksize) { - this->vb_chunk_size = chunksize; - } - void set_max_number_of_iterations(const int &max_phases) { - this->max_number_of_iterations = max_phases; - } - void set_eb_num_initial_colors(const int &num_initial_colors) { - this->eb_num_initial_colors = num_initial_colors; - } - void add_to_overall_coloring_time(const double &coloring_time_) { - this->overall_coloring_time += coloring_time_; - } + void set_vb_edge_filtering(const bool &use_vb_edge_filtering) { this->vb_edge_filtering = use_vb_edge_filtering; } + void set_vb_chunk_size(const int &chunksize) { this->vb_chunk_size = chunksize; } + void set_max_number_of_iterations(const int &max_phases) { this->max_number_of_iterations = max_phases; } + void set_eb_num_initial_colors(const int &num_initial_colors) { this->eb_num_initial_colors = num_initial_colors; } + void add_to_overall_coloring_time(const double &coloring_time_) { this->overall_coloring_time += coloring_time_; } void add_to_overall_coloring_time_phase1(const double &coloring_time_) { this->overall_coloring_time_phase1 += coloring_time_; } @@ -696,12 +604,8 @@ class GraphColoringHandle { void add_to_overall_coloring_time_phase5(const double &coloring_time_) { this->overall_coloring_time_phase5 += coloring_time_; } - void set_coloring_time(const double &coloring_time_) { - this->coloring_time = coloring_time_; - } - void set_num_phases(const double &num_phases_) { - this->num_phases = num_phases_; - } + void set_coloring_time(const double &coloring_time_) { this->coloring_time = coloring_time_; } + void set_num_phases(const double &num_phases_) { this->num_phases = num_phases_; } void set_vertex_colors(const color_view_t vertex_colors_) { this->vertex_colors = vertex_colors_; this->is_coloring_called_before = true; diff --git a/graph/src/KokkosGraph_Distance2Color.hpp b/graph/src/KokkosGraph_Distance2Color.hpp index c40ec72ece..a6555915bb 100644 --- a/graph/src/KokkosGraph_Distance2Color.hpp +++ b/graph/src/KokkosGraph_Distance2Color.hpp @@ -44,16 +44,13 @@ namespace Experimental { */ template -void graph_color_distance2(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_verts, - InRowmap row_map, InEntries row_entries) { +void graph_color_distance2(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_verts, InRowmap row_map, + InEntries row_entries) { using size_type = typename KernelHandle::size_type; using lno_t = typename KernelHandle::nnz_lno_t; - using InternalRowmap = Kokkos::View>; - using InternalEntries = Kokkos::View>; Kokkos::Timer timer; size_type nnz = row_entries.extent(0); @@ -61,11 +58,9 @@ void graph_color_distance2(KernelHandle *handle, InternalEntries rowentries_internal(row_entries.data(), nnz); auto gch_d2 = handle->get_distance2_graph_coloring_handle(); // note: last template argument 'false' means do distance-2, not bipartite - KokkosGraph::Impl::GraphColorDistance2< - typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, - InternalEntries, false> - gc(num_verts, num_verts, rowmap_internal, rowentries_internal, - rowmap_internal, rowentries_internal, gch_d2); + KokkosGraph::Impl::GraphColorDistance2 + gc(num_verts, num_verts, rowmap_internal, rowentries_internal, rowmap_internal, rowentries_internal, gch_d2); gc.compute_distance2_color(); gch_d2->add_to_overall_coloring_time(timer.seconds()); gch_d2->set_coloring_time(timer.seconds()); @@ -104,24 +99,18 @@ void graph_color_distance2(KernelHandle *handle, */ template -void bipartite_color_rows(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_columns, - InRowmap row_map, InEntries row_entries, +void bipartite_color_rows(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t num_columns, InRowmap row_map, InEntries row_entries, bool is_symmetric = false) { using execution_space = typename KernelHandle::HandleExecSpace; using size_type = typename KernelHandle::size_type; using lno_t = typename KernelHandle::nnz_lno_t; - using InternalRowmap = Kokkos::View>; - using InternalEntries = Kokkos::View>; - using TRowmap = Kokkos::View; - using TEntries = Kokkos::View; + using TRowmap = Kokkos::View; + using TEntries = Kokkos::View; Kokkos::Timer timer; size_type nnz = row_entries.extent(0); TRowmap col_map; @@ -130,8 +119,7 @@ void bipartite_color_rows(KernelHandle *handle, // Compute the transpose col_map = TRowmap("Col map", num_columns + 1); col_entries = TEntries("Col entries", nnz); - KokkosSparse::Impl::transpose_graph( + KokkosSparse::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); } InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0)); @@ -147,11 +135,9 @@ void bipartite_color_rows(KernelHandle *handle, } auto gch_d2 = handle->get_distance2_graph_coloring_handle(); // note: last template argument 'true' means do bipartite one-sided - KokkosGraph::Impl::GraphColorDistance2< - typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, - InternalEntries, true> - gc(num_rows, num_columns, rowmap_internal, rowentries_internal, - colmap_internal, colentries_internal, gch_d2); + KokkosGraph::Impl::GraphColorDistance2 + gc(num_rows, num_columns, rowmap_internal, rowentries_internal, colmap_internal, colentries_internal, gch_d2); gc.compute_distance2_color(); gch_d2->add_to_overall_coloring_time(timer.seconds()); gch_d2->set_coloring_time(timer.seconds()); @@ -185,31 +171,23 @@ void bipartite_color_rows(KernelHandle *handle, * return a view of length num_columns, containing the colors. */ template -void bipartite_color_columns(KernelHandle *handle, - typename KernelHandle::nnz_lno_t num_rows, - typename KernelHandle::nnz_lno_t num_columns, - InRowmap row_map, InEntries row_entries) { +void bipartite_color_columns(KernelHandle *handle, typename KernelHandle::nnz_lno_t num_rows, + typename KernelHandle::nnz_lno_t num_columns, InRowmap row_map, InEntries row_entries) { using execution_space = typename KernelHandle::HandleExecSpace; using size_type = typename KernelHandle::size_type; using lno_t = typename KernelHandle::nnz_lno_t; - using InternalRowmap = Kokkos::View>; - using InternalEntries = Kokkos::View>; - using TRowmap = Kokkos::View; - using TEntries = Kokkos::View; + using TRowmap = Kokkos::View; + using TEntries = Kokkos::View; Kokkos::Timer timer; size_type nnz = row_entries.extent(0); // Compute the transpose TRowmap col_map("Col map", num_columns + 1); - TEntries col_entries( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz); - KokkosSparse::Impl::transpose_graph( + TEntries col_entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz); + KokkosSparse::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); // Get unmanaged views for both graph and its transpose InternalRowmap colmap_internal(col_map.data(), col_map.extent(0)); @@ -218,11 +196,9 @@ void bipartite_color_columns(KernelHandle *handle, InternalEntries rowentries_internal(row_entries.data(), nnz); auto gch_d2 = handle->get_distance2_graph_coloring_handle(); // note: last template argument 'true' means do bipartite one-sided - KokkosGraph::Impl::GraphColorDistance2< - typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, - InternalEntries, true> - gc(num_columns, num_rows, colmap_internal, colentries_internal, - rowmap_internal, rowentries_internal, gch_d2); + KokkosGraph::Impl::GraphColorDistance2 + gc(num_columns, num_rows, colmap_internal, colentries_internal, rowmap_internal, rowentries_internal, gch_d2); gc.compute_distance2_color(); gch_d2->add_to_overall_coloring_time(timer.seconds()); gch_d2->set_coloring_time(timer.seconds()); diff --git a/graph/src/KokkosGraph_Distance2ColorHandle.hpp b/graph/src/KokkosGraph_Distance2ColorHandle.hpp index c6508e0ba8..f50ce08fef 100644 --- a/graph/src/KokkosGraph_Distance2ColorHandle.hpp +++ b/graph/src/KokkosGraph_Distance2ColorHandle.hpp @@ -36,45 +36,37 @@ enum GraphColoringAlgorithmDistance2 { COLORING_D2_NB_BIT // Distance-2 Graph Coloring Net Based BIT }; -template +template class GraphColorDistance2Handle { public: - using HandleExecSpace = ExecutionSpace; - using HandleTempMemorySpace = TemporaryMemorySpace; - using HandlePersistentMemorySpace = PersistentMemorySpace; - using size_type = typename std::remove_const::type; - using const_size_type = const size_type; - using nnz_lno_type = typename std::remove_const::type; - using const_nnz_lno_type = const nnz_lno_type; - using color_type = typename std::remove_const::type; - using const_color_type = const color_type; - using color_view_type = - typename Kokkos::View; - using color_view_array_layout = typename color_view_type::array_layout; - using color_view_device_type = typename color_view_type::device_type; - using color_view_memory_traits = typename color_view_type::memory_traits; - using color_host_view_type = typename color_view_type::HostMirror; - using size_type_temp_work_view_type = - typename Kokkos::View; - using size_type_persistent_work_view_type = - typename Kokkos::View; - using size_type_persistent_work_host_view_type = - typename size_type_persistent_work_view_type::HostMirror; - using nnz_lno_temp_work_view_type = - typename Kokkos::View; - using nnz_lno_persistent_work_view_type = - typename Kokkos::View; - using nnz_lno_persistent_work_host_view_type = - typename nnz_lno_persistent_work_view_type::HostMirror; - using team_policy_type = Kokkos::TeamPolicy; - using team_member_type = typename team_policy_type::member_type; - using non_const_1d_size_type_view_type = typename Kokkos::View; + using HandleExecSpace = ExecutionSpace; + using HandleTempMemorySpace = TemporaryMemorySpace; + using HandlePersistentMemorySpace = PersistentMemorySpace; + using size_type = typename std::remove_const::type; + using const_size_type = const size_type; + using nnz_lno_type = typename std::remove_const::type; + using const_nnz_lno_type = const nnz_lno_type; + using color_type = typename std::remove_const::type; + using const_color_type = const color_type; + using color_view_type = typename Kokkos::View; + using color_view_array_layout = typename color_view_type::array_layout; + using color_view_device_type = typename color_view_type::device_type; + using color_view_memory_traits = typename color_view_type::memory_traits; + using color_host_view_type = typename color_view_type::HostMirror; + using size_type_temp_work_view_type = typename Kokkos::View; + using size_type_persistent_work_view_type = typename Kokkos::View; + using size_type_persistent_work_host_view_type = typename size_type_persistent_work_view_type::HostMirror; + using nnz_lno_temp_work_view_type = typename Kokkos::View; + using nnz_lno_persistent_work_view_type = typename Kokkos::View; + using nnz_lno_persistent_work_host_view_type = typename nnz_lno_persistent_work_view_type::HostMirror; + using team_policy_type = Kokkos::TeamPolicy; + using team_member_type = typename team_policy_type::member_type; + using non_const_1d_size_type_view_type = typename Kokkos::View; private: // Parameters - GraphColoringAlgorithmDistance2 - coloring_algorithm_type; // Which algorithm type to use. + GraphColoringAlgorithmDistance2 coloring_algorithm_type; // Which algorithm type to use. bool verbose; // verbosity flag bool tictoc; // print time at every step @@ -82,20 +74,20 @@ class GraphColorDistance2Handle { bool vb_edge_filtering; // whether to do edge filtering or not in vertex // based algorithms. - int vb_chunk_size; // the (minimum) size of the consecutive works that a - // thread will be assigned to. + int vb_chunk_size; // the (minimum) size of the consecutive works that a + // thread will be assigned to. int max_number_of_iterations; // maximum allowed number of phases that // STATISTICS - double overall_coloring_time; // The overall time taken to color the graph. - // In the case of the iterative calls. + double overall_coloring_time; // The overall time taken to color the graph. + // In the case of the iterative calls. double overall_coloring_time_phase1; // double overall_coloring_time_phase2; // double overall_coloring_time_phase3; // Some timer accumulators for internal // phases. double overall_coloring_time_phase4; // double overall_coloring_time_phase5; // - double coloring_time; // the time that it took to color the graph + double coloring_time; // the time that it took to color the graph bool use_vtx_list; nnz_lno_temp_work_view_type vertex_list; @@ -159,8 +151,7 @@ class GraphColorDistance2Handle { * * @return None */ - void set_algorithm(const GraphColoringAlgorithmDistance2& col_algo, - bool set_default_parameters = true) { + void set_algorithm(const GraphColoringAlgorithmDistance2& col_algo, bool set_default_parameters = true) { if (col_algo == COLORING_D2_DEFAULT) { this->choose_default_algorithm(); } else { @@ -182,26 +173,23 @@ class GraphColorDistance2Handle { */ void choose_default_algorithm() { - if (KokkosKernels::Impl::kk_get_exec_space_type() == - KokkosKernels::Impl::Exec_SERIAL) { + if (KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_D2_SERIAL; #ifdef VERBOSE - std::cout - << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; + std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; #endif } else { this->coloring_algorithm_type = COLORING_D2_NB_BIT; #ifdef VERBOSE - std::cout << ExecutionSpace::name() - << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; #endif } } nnz_lno_type get_num_colors() { if (num_colors == 0) - KokkosKernels::Impl::view_reduce_max( - vertex_colors.extent(0), vertex_colors, num_colors); + KokkosKernels::Impl::view_reduce_max(vertex_colors.extent(0), vertex_colors, + num_colors); return num_colors; } @@ -219,9 +207,7 @@ class GraphColorDistance2Handle { this->vb_chunk_size = 8; this->max_number_of_iterations = 200; break; - default: - throw std::runtime_error( - "Unknown Distance-2 Graph Coloring Algorithm\n"); + default: throw std::runtime_error("Unknown Distance-2 Graph Coloring Algorithm\n"); } } @@ -231,35 +217,19 @@ class GraphColorDistance2Handle { virtual ~GraphColorDistance2Handle(){}; // getters and setters - GraphColoringAlgorithmDistance2 get_coloring_algo_type() const { - return this->coloring_algorithm_type; - } + GraphColoringAlgorithmDistance2 get_coloring_algo_type() const { return this->coloring_algorithm_type; } bool get_verbose() const { return this->verbose; } double get_coloring_time() const { return this->coloring_time; } - int get_max_number_of_iterations() const { - return this->max_number_of_iterations; - } + int get_max_number_of_iterations() const { return this->max_number_of_iterations; } int get_num_phases() const { return this->num_phases; } - double get_overall_coloring_time() const { - return this->overall_coloring_time; - } - double get_overall_coloring_time_phase1() const { - return this->overall_coloring_time_phase1; - } - double get_overall_coloring_time_phase2() const { - return this->overall_coloring_time_phase2; - } - double get_overall_coloring_time_phase3() const { - return this->overall_coloring_time_phase3; - } - double get_overall_coloring_time_phase4() const { - return this->overall_coloring_time_phase4; - } - double get_overall_coloring_time_phase5() const { - return this->overall_coloring_time_phase5; - } + double get_overall_coloring_time() const { return this->overall_coloring_time; } + double get_overall_coloring_time_phase1() const { return this->overall_coloring_time_phase1; } + double get_overall_coloring_time_phase2() const { return this->overall_coloring_time_phase2; } + double get_overall_coloring_time_phase3() const { return this->overall_coloring_time_phase3; } + double get_overall_coloring_time_phase4() const { return this->overall_coloring_time_phase4; } + double get_overall_coloring_time_phase5() const { return this->overall_coloring_time_phase5; } bool get_tictoc() const { return this->tictoc; } @@ -272,14 +242,11 @@ class GraphColorDistance2Handle { bool is_coloring_called() const { return this->is_coloring_called_before; } bool get_use_vtx_list() const { return this->use_vtx_list; } - nnz_lno_temp_work_view_type get_vertex_list() const { - return this->vertex_list; - } + nnz_lno_temp_work_view_type get_vertex_list() const { return this->vertex_list; } size_type get_vertex_list_size() const { return this->vertex_list_size; } // setters - void set_vertex_list(nnz_lno_temp_work_view_type vertex_list_, - size_type vertex_list_size_) { + void set_vertex_list(nnz_lno_temp_work_view_type vertex_list_, size_type vertex_list_size_) { this->vertex_list = vertex_list_; this->vertex_list_size = vertex_list_size_; this->use_vtx_list = true; @@ -291,19 +258,11 @@ class GraphColorDistance2Handle { } void set_verbose(const bool verbose_) { this->verbose = verbose_; } - void set_coloring_time(const double& coloring_time_) { - this->coloring_time = coloring_time_; - } - void set_max_number_of_iterations(const int& max_phases) { - this->max_number_of_iterations = max_phases; - } - void set_num_phases(const int& num_phases_) { - this->num_phases = num_phases_; - } + void set_coloring_time(const double& coloring_time_) { this->coloring_time = coloring_time_; } + void set_max_number_of_iterations(const int& max_phases) { this->max_number_of_iterations = max_phases; } + void set_num_phases(const int& num_phases_) { this->num_phases = num_phases_; } - void add_to_overall_coloring_time(const double& coloring_time_) { - this->overall_coloring_time += coloring_time_; - } + void add_to_overall_coloring_time(const double& coloring_time_) { this->overall_coloring_time += coloring_time_; } void add_to_overall_coloring_time_phase1(const double& coloring_time_) { this->overall_coloring_time_phase1 += coloring_time_; } @@ -322,13 +281,9 @@ class GraphColorDistance2Handle { void set_tictoc(const bool use_tictoc) { this->tictoc = use_tictoc; } - void set_vb_chunk_size(const int& chunksize) { - this->vb_chunk_size = chunksize; - } + void set_vb_chunk_size(const int& chunksize) { this->vb_chunk_size = chunksize; } - void set_vb_edge_filtering(const bool& use_vb_edge_filtering) { - this->vb_edge_filtering = use_vb_edge_filtering; - } + void set_vb_edge_filtering(const bool& use_vb_edge_filtering) { this->vb_edge_filtering = use_vb_edge_filtering; } void set_vertex_colors(const color_view_type vertex_colors_) { this->vertex_colors = vertex_colors_; @@ -349,10 +304,8 @@ class GraphColorDistance2Handle { * object (i.e., `std::ofstream os("G.dot", std::ofstream::out);`) to write to * a file. */ - template - void dump_graphviz(std::ostream& os, const size_t num_verts, - rowmap_type& rowmap, entries_type& entries, + template + void dump_graphviz(std::ostream& os, const size_t num_verts, rowmap_type& rowmap, entries_type& entries, kokkos_view_type& colors) const { using h_colors_type = typename kokkos_view_type::HostMirror; using h_rowmap_type = typename rowmap_type::HostMirror; @@ -407,13 +360,11 @@ class GraphColorDistance2Handle { penwidth = ", penwidth=\"2.0\""; } - os << " " << vid << " [ label=\"" << vid << "|" << h_colors(vid) - << "\"" << style << fontcolor << color << fillcolor << penwidth << "];" - << std::endl; + os << " " << vid << " [ label=\"" << vid << "|" << h_colors(vid) << "\"" << style << fontcolor << color + << fillcolor << penwidth << "];" << std::endl; // Add the node's edges - for (size_t iadj = h_rowmap(vid); iadj < (size_t)h_rowmap(vid + 1); - iadj++) { + for (size_t iadj = h_rowmap(vid); iadj < (size_t)h_rowmap(vid + 1); iadj++) { size_t vadj = h_entries(iadj); if (vadj >= vid) { os << " " << vid << " -- " << vadj << ";" << std::endl; diff --git a/graph/src/KokkosGraph_ExplicitCoarsening.hpp b/graph/src/KokkosGraph_ExplicitCoarsening.hpp index 3c655026f5..67c4fbd453 100644 --- a/graph/src/KokkosGraph_ExplicitCoarsening.hpp +++ b/graph/src/KokkosGraph_ExplicitCoarsening.hpp @@ -32,35 +32,27 @@ namespace Experimental { // An uncompressed graph will still work as input to some things like D1 graph // coloring. -template -void graph_explicit_coarsen( - const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, - const labels_t& labels, - typename fine_entries_t::non_const_value_type numCoarseVerts, - coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, - bool compress = true) { +template +void graph_explicit_coarsen(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, const labels_t& labels, + typename fine_entries_t::non_const_value_type numCoarseVerts, coarse_rowmap_t& coarseRowmap, + coarse_entries_t& coarseEntries, bool compress = true) { using size_type = typename fine_rowmap_t::non_const_value_type; using lno_t = typename fine_entries_t::non_const_value_type; using exec_space = typename device_t::execution_space; - static_assert( - std::is_same::value, - "graph_explicit_coarsen: The coarse and fine entry Views have different " - "value types."); - KokkosGraph::Impl::ExplicitGraphCoarsening< - lno_t, size_type, device_t, fine_rowmap_t, fine_entries_t, labels_t, - coarse_rowmap_t, coarse_entries_t, coarse_entries_t> + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different " + "value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening egc(fineRowmap, fineEntries, labels, numCoarseVerts); coarseRowmap = egc.coarseRowmap; coarseEntries = egc.coarseEntries; if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosSparse::sort_and_merge_graph( - coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + KokkosSparse::sort_and_merge_graph(coarseRowmap, coarseEntries, + mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; } @@ -68,27 +60,22 @@ void graph_explicit_coarsen( // Same as above, but also produce the map from coarse vertices to fine vertices // (inverse map of labels) -template -void graph_explicit_coarsen_with_inverse_map( - const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, - const labels_t& labels, - typename fine_entries_t::non_const_value_type numCoarseVerts, - coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, - ordinal_view_t& inverseOffsets, ordinal_view_t& inverseLabels, - bool compress = true) { +template +void graph_explicit_coarsen_with_inverse_map(const fine_rowmap_t& fineRowmap, const fine_entries_t& fineEntries, + const labels_t& labels, + typename fine_entries_t::non_const_value_type numCoarseVerts, + coarse_rowmap_t& coarseRowmap, coarse_entries_t& coarseEntries, + ordinal_view_t& inverseOffsets, ordinal_view_t& inverseLabels, + bool compress = true) { using size_type = typename fine_rowmap_t::non_const_value_type; using lno_t = typename fine_entries_t::non_const_value_type; using exec_space = typename device_t::execution_space; - static_assert( - std::is_same::value, - "graph_explicit_coarsen: The coarse and fine entry Views have different " - "value types."); - KokkosGraph::Impl::ExplicitGraphCoarsening< - lno_t, size_type, device_t, fine_rowmap_t, fine_entries_t, labels_t, - coarse_rowmap_t, coarse_entries_t, ordinal_view_t> + static_assert(std::is_same::value, + "graph_explicit_coarsen: The coarse and fine entry Views have different " + "value types."); + KokkosGraph::Impl::ExplicitGraphCoarsening egc(fineRowmap, fineEntries, labels, numCoarseVerts); coarseRowmap = egc.coarseRowmap; coarseEntries = egc.coarseEntries; @@ -97,9 +84,8 @@ void graph_explicit_coarsen_with_inverse_map( if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosSparse::sort_and_merge_graph( - coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + KokkosSparse::sort_and_merge_graph(coarseRowmap, coarseEntries, + mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; } diff --git a/graph/src/KokkosGraph_MIS2.hpp b/graph/src/KokkosGraph_MIS2.hpp index fb38d05456..4af491a406 100644 --- a/graph/src/KokkosGraph_MIS2.hpp +++ b/graph/src/KokkosGraph_MIS2.hpp @@ -30,21 +30,18 @@ enum MIS2_Algorithm { MIS2_QUALITY, MIS2_FAST }; template -lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, - MIS2_Algorithm algo = MIS2_FAST) { +lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) { if (rowmap.extent(0) <= 1) { // zero vertices means the MIS is empty. return lno_view_t(); } switch (algo) { case MIS2_QUALITY: { - Impl::D2_MIS_FixedPriority mis( - rowmap, colinds); + Impl::D2_MIS_FixedPriority mis(rowmap, colinds); return mis.compute(); } case MIS2_FAST: { - Impl::D2_MIS_RandomPriority - mis(rowmap, colinds); + Impl::D2_MIS_RandomPriority mis(rowmap, colinds); return mis.compute(); } } @@ -53,16 +50,14 @@ lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, template -labels_t graph_mis2_coarsen( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numClusters) { +labels_t graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numClusters) { if (rowmap.extent(0) <= 1) { // there are no vertices to label numClusters = 0; return labels_t(); } - Impl::D2_MIS_Aggregation aggregation( - rowmap, colinds); + Impl::D2_MIS_Aggregation aggregation(rowmap, colinds); aggregation.compute(false); numClusters = aggregation.numAggs; return aggregation.labels; @@ -70,16 +65,14 @@ labels_t graph_mis2_coarsen( template -labels_t graph_mis2_aggregate( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numAggregates) { +labels_t graph_mis2_aggregate(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numAggregates) { if (rowmap.extent(0) <= 1) { // there are no vertices to label numAggregates = 0; return labels_t(); } - Impl::D2_MIS_Aggregation aggregation( - rowmap, colinds); + Impl::D2_MIS_Aggregation aggregation(rowmap, colinds); aggregation.compute(true); numAggregates = aggregation.numAggs; return aggregation.labels; @@ -101,31 +94,23 @@ namespace Experimental { template -[[deprecated]] lno_view_t graph_d2_mis(const rowmap_t& rowmap, - const colinds_t& colinds, +[[deprecated]] lno_view_t graph_d2_mis(const rowmap_t& rowmap, const colinds_t& colinds, MIS2_Algorithm algo = MIS2_FAST) { - return KokkosGraph::graph_d2_mis( - rowmap, colinds, algo); + return KokkosGraph::graph_d2_mis(rowmap, colinds, algo); } template -[[deprecated]] labels_t graph_mis2_coarsen( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numClusters) { - return KokkosGraph::graph_mis2_coarsen(rowmap, colinds, - numClusters); +[[deprecated]] labels_t graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numClusters) { + return KokkosGraph::graph_mis2_coarsen(rowmap, colinds, numClusters); } template -[[deprecated]] labels_t graph_mis2_aggregate( - const rowmap_t& rowmap, const colinds_t& colinds, - typename colinds_t::non_const_value_type& numAggregates) { - return KokkosGraph::graph_mis2_aggregate(rowmap, colinds, - numAggregates); +[[deprecated]] labels_t graph_mis2_aggregate(const rowmap_t& rowmap, const colinds_t& colinds, + typename colinds_t::non_const_value_type& numAggregates) { + return KokkosGraph::graph_mis2_aggregate(rowmap, colinds, numAggregates); } [[deprecated]] inline const char* mis2_algorithm_name(MIS2_Algorithm algo) { diff --git a/graph/src/KokkosGraph_Triangle.hpp b/graph/src/KokkosGraph_Triangle.hpp index 0a878891ce..6ab6dd7b9a 100644 --- a/graph/src/KokkosGraph_Triangle.hpp +++ b/graph/src/KokkosGraph_Triangle.hpp @@ -148,15 +148,11 @@ transposeA, row_mapB, entriesB, transposeB); } */ -template -void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, - typename KernelHandle::nnz_lno_t k, - alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, - bool transposeA, blno_row_view_t_ row_mapB, - blno_nnz_view_t_ entriesB, bool transposeB, +void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, typename KernelHandle::nnz_lno_t n, + typename KernelHandle::nnz_lno_t k, alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, + bool transposeA, blno_row_view_t_ row_mapB, blno_nnz_view_t_ entriesB, bool transposeB, visit_struct_t visit_struct) { using namespace KokkosSparse; @@ -168,30 +164,24 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_IA: case SPGEMM_KK_TRIANGLE_IA_UNION: default: { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, alno_row_view_t_, alno_nnz_view_t_, - typename KernelHandle::in_scalar_nnz_view_t, blno_row_view_t_, - blno_nnz_view_t_, typename KernelHandle::in_scalar_nnz_view_t> - kspgemm(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, - entriesB, transposeB); + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB, transposeB); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } break; } } -template -void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, - visit_struct_t visit_struct) { +template +void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, alno_row_view_t_ row_mapA, + alno_nnz_view_t_ entriesA, visit_struct_t visit_struct) { typedef typename KernelHandle::nnz_lno_t nnz_lno_t; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::SPGEMMHandleType spgemmHandleType; - typedef typename KernelHandle::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; - typedef typename KernelHandle::row_lno_persistent_work_view_t - row_lno_persistent_work_view_t; + typedef typename KernelHandle::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; + typedef typename KernelHandle::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; typedef typename KernelHandle::HandleExecSpace ExecutionSpace; @@ -207,8 +197,8 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, should_i_sort = true; else if (sort_lower_triangle == 2) { size_type max_row_size = 0; - KokkosKernels::Impl::kk_view_reduce_max_row_size( - m, row_mapA.data(), row_mapA.data() + 1, max_row_size); + KokkosKernels::Impl::kk_view_reduce_max_row_size(m, row_mapA.data(), row_mapA.data() + 1, + max_row_size); if (max_row_size > 1000) { should_i_sort = true; @@ -217,13 +207,11 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, if (should_i_sort) { if (sh->get_lower_triangular_permutation().data() == NULL) { - nnz_lno_persistent_work_view_t new_indices( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "new_indices"), m); + nnz_lno_persistent_work_view_t new_indices(Kokkos::view_alloc(Kokkos::WithoutInitializing, "new_indices"), m); int sort_decreasing_order = 1; ////If true we place the largest row to top, so that largest row size will /// be minimized in lower triangle. - if (sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_AI || - sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { + if (sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_AI || sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { sort_decreasing_order = 0; // if false we place the largest row to bottom, so that largest column // is minimizedin lower triangle. @@ -232,10 +220,8 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, // if 2, we do an interleaved sort. } { - KokkosSparse::Impl::kk_sort_by_row_size( - m, row_mapA.data(), new_indices.data(), sort_decreasing_order, - ExecutionSpace().concurrency()); + KokkosSparse::Impl::kk_sort_by_row_size( + m, row_mapA.data(), new_indices.data(), sort_decreasing_order, ExecutionSpace().concurrency()); } sh->set_lower_triangular_permutation(new_indices); } @@ -250,56 +236,43 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, row_lno_persistent_work_view_t lower_triangular_matrix_rowmap; nnz_lno_persistent_work_view_t lower_triangular_matrix_entries; timer1.reset(); - if (create_lower_triangular || - sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LL || + if (create_lower_triangular || sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LL || sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { - sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); - if (lower_triangular_matrix_rowmap.data() == NULL || - lower_triangular_matrix_entries.data() == NULL) { + sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); + if (lower_triangular_matrix_rowmap.data() == NULL || lower_triangular_matrix_entries.data() == NULL) { alno_nnz_view_t_ null_values; - nnz_lno_persistent_work_view_t new_indices = - sh->get_lower_triangular_permutation(); - - KokkosSparse::Impl::kk_get_lower_triangle< - alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( - m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, null_values, new_indices, - handle->is_dynamic_scheduling(), + nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); + + KokkosSparse::Impl::kk_get_lower_triangle( + m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, + null_values, new_indices, handle->is_dynamic_scheduling(), handle->get_team_work_size(1, ExecutionSpace().concurrency(), m)); - sh->set_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); + sh->set_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); } } if (handle->get_verbose()) { - std::cout << "Preprocess Create Lower Triangular Time:" << timer1.seconds() - << std::endl; + std::cout << "Preprocess Create Lower Triangular Time:" << timer1.seconds() << std::endl; } timer1.reset(); row_lno_persistent_work_view_t upper_triangular_matrix_rowmap; nnz_lno_persistent_work_view_t upper_triangular_matrix_entries; if (sh->get_algorithm_type() == SPGEMM_KK_TRIANGLE_LU) { - sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); + sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); alno_nnz_view_t_ null_values; - nnz_lno_persistent_work_view_t new_indices = - sh->get_lower_triangular_permutation(); - - KokkosSparse::Impl::kk_get_lower_triangle< - alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( - m, row_mapA, entriesA, null_values, upper_triangular_matrix_rowmap, - upper_triangular_matrix_entries, null_values, new_indices, - handle->is_dynamic_scheduling(), 4, false); + nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); + + KokkosSparse::Impl::kk_get_lower_triangle( + m, row_mapA, entriesA, null_values, upper_triangular_matrix_rowmap, upper_triangular_matrix_entries, + null_values, new_indices, handle->is_dynamic_scheduling(), 4, false); } if (handle->get_verbose()) { - std::cout << "Preprocess Create Upper Triangular Time:" << timer1.seconds() - << std::endl; + std::cout << "Preprocess Create Upper Triangular Time:" << timer1.seconds() << std::endl; } /////////CREATE LOWER TRIANGLE/////// @@ -320,33 +293,25 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_IA: { // these are the algorithms that requires transpose of the incidence // matrix. - sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries); + sh->get_lower_triangular_matrix(lower_triangular_matrix_rowmap, lower_triangular_matrix_entries); - if (lower_triangular_matrix_rowmap.data() == NULL || - lower_triangular_matrix_entries.data() == NULL) { + if (lower_triangular_matrix_rowmap.data() == NULL || lower_triangular_matrix_entries.data() == NULL) { std::cout << "Creating lower triangular A" << std::endl; alno_nnz_view_t_ null_values; - nnz_lno_persistent_work_view_t new_indices = - sh->get_lower_triangular_permutation(); - - KokkosSparse::Impl::kk_get_lower_triangle< - alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( - m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, null_values, new_indices, - handle->is_dynamic_scheduling()); + nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); + + KokkosSparse::Impl::kk_get_lower_triangle( + m, row_mapA, entriesA, null_values, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, + null_values, new_indices, handle->is_dynamic_scheduling()); } - KokkosSparse::Impl:: - kk_create_incidence_tranpose_matrix_from_lower_triangle< - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - ExecutionSpace>( - m, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, incidence_transpose_rowmap, - incidence_transpose_entries, handle->is_dynamic_scheduling()); + KokkosSparse::Impl::kk_create_incidence_tranpose_matrix_from_lower_triangle< + row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, row_lno_persistent_work_view_t, + nnz_lno_persistent_work_view_t, ExecutionSpace>(m, lower_triangular_matrix_rowmap, + lower_triangular_matrix_entries, incidence_transpose_rowmap, + incidence_transpose_entries, handle->is_dynamic_scheduling()); } break; // IF it is one of below, we perform (A) or (L) x I @@ -355,12 +320,10 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, // these are the algorithms that requires the incidence matrix. KokkosSparse::Impl::kk_create_incidence_matrix_from_original_matrix< - alno_row_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - ExecutionSpace>(m, row_mapA, entriesA, incidence_rowmap, - incidence_entries, - sh->get_lower_triangular_permutation(), - handle->is_dynamic_scheduling()); + alno_row_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, + nnz_lno_persistent_work_view_t, ExecutionSpace>(m, row_mapA, entriesA, incidence_rowmap, incidence_entries, + sh->get_lower_triangular_permutation(), + handle->is_dynamic_scheduling()); } break; case SPGEMM_KK_TRIANGLE_LU: case SPGEMM_KK_TRIANGLE_LL: @@ -370,8 +333,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, } if (handle->get_verbose()) { - std::cout << "Preprocess Incidence Matrix Create Time:" << timer1.seconds() - << std::endl; + std::cout << "Preprocess Incidence Matrix Create Time:" << timer1.seconds() << std::endl; } //// /// CREATE INCIDENCE MATRIX END @@ -380,49 +342,36 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, switch (sh->get_algorithm_type()) { default: case SPGEMM_KK_TRIANGLE_LL: { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false, - lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false); + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false, + lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } break; case SPGEMM_KK_TRIANGLE_LU: { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false, - upper_triangular_matrix_rowmap, - upper_triangular_matrix_entries, false); + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, m, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false, + upper_triangular_matrix_rowmap, upper_triangular_matrix_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } break; case SPGEMM_KK_TRIANGLE_AI: { if (create_lower_triangular) { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, incidence_entries.extent(0) / 2, - lower_triangular_matrix_rowmap, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, incidence_entries.extent(0) / 2, lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false, // transpose ignore. incidence_rowmap, incidence_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } else { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, alno_row_view_t_, alno_nnz_view_t_, - nnz_lno_persistent_work_view_t, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t> - kspgemm(handle, m, m, incidence_entries.extent(0) / 2, row_mapA, - entriesA, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, m, m, incidence_entries.extent(0) / 2, row_mapA, entriesA, false, // transpose ignore. incidence_rowmap, incidence_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); @@ -433,24 +382,20 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_IA_UNION: case SPGEMM_KK_TRIANGLE_IA: { if (create_lower_triangular) { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t> - kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, - incidence_transpose_rowmap, incidence_transpose_entries, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, incidence_transpose_rowmap, + incidence_transpose_entries, false, // transpose ignore. - lower_triangular_matrix_rowmap, - lower_triangular_matrix_entries, false); + lower_triangular_matrix_rowmap, lower_triangular_matrix_entries, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); } else { - KokkosSparse::Impl::KokkosSPGEMM< - KernelHandle, row_lno_persistent_work_view_t, - nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, - alno_row_view_t_, alno_nnz_view_t_, nnz_lno_persistent_work_view_t> - kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, - incidence_transpose_rowmap, incidence_transpose_entries, + KokkosSparse::Impl::KokkosSPGEMM + kspgemm(handle, incidence_transpose_rowmap.extent(0) - 1, m, m, incidence_transpose_rowmap, + incidence_transpose_entries, false, // transpose ignore. row_mapA, entriesA, false); kspgemm.KokkosSPGEMM_generic_triangle(visit_struct); diff --git a/graph/unit_test/Test_Graph_coarsen.hpp b/graph/unit_test/Test_Graph_coarsen.hpp index 95f1533c88..2fda527dfb 100644 --- a/graph/unit_test/Test_Graph_coarsen.hpp +++ b/graph/unit_test/Test_Graph_coarsen.hpp @@ -47,23 +47,16 @@ bool verify_coarsening(typename coarsener_t::coarse_level_triple fine_l, using ordinal_t = typename entries_t::value_type; using edge_t = typename rowmap_t::value_type; - crsMat A = fine_l.mtx; - crsMat coarse_A = coarse_l.mtx; - auto f_rowmap = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); - auto c_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarse_A.graph.row_map); - auto f_entries = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); - auto vcmap = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), coarse_l.interp_mtx.graph.entries); - auto few = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); - auto cew = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.values); - auto fvw = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), fine_l.vtx_wgts); - auto cvw = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarse_l.vtx_wgts); + crsMat A = fine_l.mtx; + crsMat coarse_A = coarse_l.mtx; + auto f_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto c_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.graph.row_map); + auto f_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto vcmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_l.interp_mtx.graph.entries); + auto few = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + auto cew = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.values); + auto fvw = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), fine_l.vtx_wgts); + auto cvw = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_l.vtx_wgts); ordinal_t f_size = 0; ordinal_t c_size = 0; for (ordinal_t i = 0; i < static_cast(fvw.extent(0)); i++) { @@ -112,10 +105,8 @@ bool verify_is_graph(crsMat A) { using entries_t = typename c_entries_t::non_const_type; using ordinal_t = typename entries_t::value_type; using edge_t = typename rowmap_t::value_type; - auto rowmap = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); - auto entries = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); for (ordinal_t i = 0; i < A.numRows(); i++) { std::set adjset; @@ -158,8 +149,7 @@ bool verify_aggregator(crsMat A, crsMat agg) { if (A.numRows() < agg.numCols()) { return false; } - auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - agg.graph.entries); + auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), agg.graph.entries); std::vector aggregateSizes(agg.numCols(), 0); for (ordinal_t i = 0; i < static_cast(agg.nnz()); i++) { @@ -244,8 +234,7 @@ crsMat gen_grid() { template void test_multilevel_coarsen_grid() { - using crsMat = - KokkosSparse::CrsMatrix; + using crsMat = KokkosSparse::CrsMatrix; crsMat A = gen_grid(); using coarsener_t = coarse_builder; typename coarsener_t::coarsen_handle handle; @@ -259,17 +248,12 @@ void test_multilevel_coarsen_grid() { coarse++; while (coarse != levels.end()) { bool correct_aggregator = verify_aggregator(fine->mtx, coarse->interp_mtx); - EXPECT_TRUE(correct_aggregator) - << "Multilevel coarsening produced invalid aggregator on level " - << coarse->level - 1; + EXPECT_TRUE(correct_aggregator) << "Multilevel coarsening produced invalid aggregator on level " + << coarse->level - 1; bool correct_graph = verify_is_graph(coarse->mtx); bool correct_coarsening = verify_coarsening(*fine, *coarse); - EXPECT_TRUE(correct_graph) - << "Multilevel coarsening produced invalid graph on level " - << coarse->level; - EXPECT_TRUE(correct_coarsening) - << "Multilevel coarsening produced invalid coarsening on level " - << coarse->level; + EXPECT_TRUE(correct_graph) << "Multilevel coarsening produced invalid graph on level " << coarse->level; + EXPECT_TRUE(correct_coarsening) << "Multilevel coarsening produced invalid coarsening on level " << coarse->level; fine++; coarse++; } @@ -277,8 +261,7 @@ void test_multilevel_coarsen_grid() { template void test_coarsen_grid() { - using crsMat = - KokkosSparse::CrsMatrix; + using crsMat = KokkosSparse::CrsMatrix; using graph_type = typename crsMat::StaticCrsGraphType; using c_entries_t = typename graph_type::entries_type; using entries_t = typename c_entries_t::non_const_type; @@ -293,60 +276,49 @@ void test_coarsen_grid() { fine_A.vtx_wgts = vWgts; fine_A.level = 0; fine_A.uniform_weights = true; - std::vector heuristics = { - coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, - coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; - std::vector builders = { - coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, - coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + std::vector heuristics = {coarsener_t::HECv1, coarsener_t::Match, + coarsener_t::MtMetis, coarsener_t::MIS2, + coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = {coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; for (auto h : heuristics) { - handle.h = h; - crsMat aggregator = - coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + handle.h = h; + crsMat aggregator = coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); - EXPECT_TRUE(correct_aggregator) - << "Aggregation heuristic " << static_cast(h) - << " produced invalid aggregator."; + EXPECT_TRUE(correct_aggregator) << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; for (auto b : builders) { - handle.b = b; - clt coarse_A = - coarsener_t::build_coarse_graph(handle, fine_A, aggregator); - bool correct_graph = verify_is_graph(coarse_A.mtx); - bool correct_coarsening = - verify_coarsening(fine_A, coarse_A); - EXPECT_TRUE(correct_graph) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid graph with aggregation heuristic " - << static_cast(h) << "."; - EXPECT_TRUE(correct_coarsening) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid coarsening with aggregation heuristic " - << static_cast(h) << "."; + handle.b = b; + clt coarse_A = coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; } } } template -void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using svt = typename crsMat::values_type; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename crsMat::values_type; // Generate graph - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); graph_type GS(symEntries, symRowmap); svt symValues("sym values", symEntries.extent(0)); @@ -362,88 +334,65 @@ void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, fine_A.vtx_wgts = vWgts; fine_A.level = 0; fine_A.uniform_weights = true; - std::vector heuristics = { - coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, - coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; - std::vector builders = { - coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, - coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + std::vector heuristics = {coarsener_t::HECv1, coarsener_t::Match, + coarsener_t::MtMetis, coarsener_t::MIS2, + coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = {coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; for (auto h : heuristics) { - handle.h = h; - crsMat aggregator = - coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + handle.h = h; + crsMat aggregator = coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); - EXPECT_TRUE(correct_aggregator) - << "Aggregation heuristic " << static_cast(h) - << " produced invalid aggregator."; + EXPECT_TRUE(correct_aggregator) << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; for (auto b : builders) { - handle.b = b; - clt coarse_A = - coarsener_t::build_coarse_graph(handle, fine_A, aggregator); - bool correct_graph = verify_is_graph(coarse_A.mtx); - bool correct_coarsening = - verify_coarsening(fine_A, coarse_A); - EXPECT_TRUE(correct_graph) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid graph with aggregation heuristic " - << static_cast(h) << "."; - EXPECT_TRUE(correct_coarsening) - << "Coarsening with dedupe method " << static_cast(b) - << " produced invalid coarsening with aggregation heuristic " - << static_cast(h) << "."; + handle.b = b; + clt coarse_A = coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; } } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##random_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coarsen_random(5000, 5000 * 20, \ - 1000, 10); \ - test_coarsen_random(50, 50 * 10, 40, 10); \ - test_coarsen_random(5, 5 * 3, 5, 0); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##grid_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coarsen_grid(); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##grid_graph_multilevel_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_multilevel_coarsen_grid(); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##random_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_random(5000, 5000 * 20, 1000, 10); \ + test_coarsen_random(50, 50 * 10, 40, 10); \ + test_coarsen_random(5, 5 * 3, 5, 0); \ + } \ + TEST_F(TestCategory, graph##_##grid_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_grid(); \ + } \ + TEST_F(TestCategory, graph##_##grid_graph_multilevel_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_multilevel_coarsen_grid(); \ } // FIXME_SYCL #ifndef KOKKOS_ENABLE_SYCL #if defined(KOKKOSKERNELS_INST_DOUBLE) -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, TestDevice) #endif #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_graph_color.hpp b/graph/unit_test/Test_Graph_graph_color.hpp index 101c489bc0..3ddfa7c9b0 100644 --- a/graph/unit_test/Test_Graph_graph_color.hpp +++ b/graph/unit_test/Test_Graph_graph_color.hpp @@ -32,11 +32,8 @@ using namespace KokkosGraph::Experimental; namespace Test { template -int run_graphcolor( - crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, - size_t &num_colors, - typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type - &vertex_colors) { +int run_graphcolor(crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, size_t &num_colors, + typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type &vertex_colors) { typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -46,9 +43,8 @@ int run_graphcolor( typedef typename lno_nnz_view_t::value_type lno_t; typedef typename scalar_view_t::value_type scalar_t; - typedef KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> + typedef KokkosKernelsHandle KernelHandle; KernelHandle kh; @@ -60,9 +56,8 @@ int run_graphcolor( const size_t num_rows_1 = input_mat.numRows(); const size_t num_cols_1 = input_mat.numCols(); - graph_color( - &kh, num_rows_1, num_cols_1, input_mat.graph.row_map, - input_mat.graph.entries); + graph_color(&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, + input_mat.graph.entries); num_colors = kh.get_graph_coloring_handle()->get_num_colors(); vertex_colors = kh.get_graph_coloring_handle()->get_vertex_colors(); @@ -72,14 +67,10 @@ int run_graphcolor( } // namespace Test -template -void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + typedef typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -87,28 +78,24 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, typedef typename crsMat_t::values_type::non_const_type scalar_view_t; // typedef typename lno_view_t::non_const_value_type size_type; - lno_t numCols = numRows; - crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, row_size_variance, bandwidth); + lno_t numCols = numRows; + crsMat_t input_mat = + KokkosSparse::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, row_size_variance, bandwidth); typename lno_view_t::non_const_type sym_xadj; typename lno_nnz_view_t::non_const_type sym_adj; KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - lno_view_t, lno_nnz_view_t, typename lno_view_t::non_const_type, - typename lno_nnz_view_t::non_const_type, - typename device::execution_space>(numRows, input_mat.graph.row_map, - input_mat.graph.entries, sym_xadj, - sym_adj); + lno_view_t, lno_nnz_view_t, typename lno_view_t::non_const_type, typename lno_nnz_view_t::non_const_type, + typename device::execution_space>(numRows, input_mat.graph.row_map, input_mat.graph.entries, sym_xadj, sym_adj); size_type numentries = sym_adj.extent(0); scalar_view_t newValues("vals", numentries); graph_t static_graph(sym_adj, sym_xadj); input_mat = crsMat_t("CrsMatrix", numCols, newValues, static_graph); - std::vector coloring_algorithms = { - COLORING_DEFAULT, COLORING_SERIAL, COLORING_VB, COLORING_VBBIT, - COLORING_VBCS}; + std::vector coloring_algorithms = {COLORING_DEFAULT, COLORING_SERIAL, COLORING_VB, COLORING_VBBIT, + COLORING_VBCS}; // FIXME: VBD sometimes fails on CUDA and HIP #if defined(KOKKOS_ENABLE_CUDA) @@ -125,8 +112,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, // FIXME SYCL: re-enable this when EB is working #ifdef KOKKOS_ENABLE_SYCL - if (!std::is_same::value) { + if (!std::is_same::value) { coloring_algorithms.push_back(COLORING_EB); } #else @@ -140,28 +126,22 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Timer timer1; crsMat_t output_mat; - int res = run_graphcolor(input_mat, coloring_algorithm, - num_colors, vector_colors); + int res = run_graphcolor(input_mat, coloring_algorithm, num_colors, vector_colors); // double coloring_time = timer1.seconds(); EXPECT_TRUE((res == 0)); const lno_t num_rows_1 = input_mat.numRows(); const lno_t num_cols_1 = input_mat.numCols(); - lno_t num_conflict = KokkosSparse::Impl::kk_is_d1_coloring_valid< - lno_view_t, lno_nnz_view_t, color_view_t, - typename device::execution_space>( - num_rows_1, num_cols_1, input_mat.graph.row_map, - input_mat.graph.entries, vector_colors); + lno_t num_conflict = KokkosSparse::Impl::kk_is_d1_coloring_valid( + num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, vector_colors); lno_t conf = 0; { // also check the correctness of the validation code :) - typename lno_view_t::HostMirror hrm = - Kokkos::create_mirror_view(input_mat.graph.row_map); - typename lno_nnz_view_t::HostMirror hentries = - Kokkos::create_mirror_view(input_mat.graph.entries); - typename color_view_t::HostMirror hcolor = - Kokkos::create_mirror_view(vector_colors); + typename lno_view_t::HostMirror hrm = Kokkos::create_mirror_view(input_mat.graph.row_map); + typename lno_nnz_view_t::HostMirror hentries = Kokkos::create_mirror_view(input_mat.graph.entries); + typename color_view_t::HostMirror hcolor = Kokkos::create_mirror_view(vector_colors); Kokkos::deep_copy(hrm, input_mat.graph.row_map); Kokkos::deep_copy(hentries, input_mat.graph.entries); Kokkos::deep_copy(hcolor, vector_colors); @@ -179,53 +159,39 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, } } } - EXPECT_TRUE((num_conflict == conf)) - << "Coloring algo " << (int)coloring_algorithm - << ": kk_is_d1_coloring_valid returned incorrect number of conflicts (" - << num_conflict << ", should be " << conf << ")"; - - EXPECT_TRUE((num_conflict == 0)) - << "Coloring algo " << (int)coloring_algorithm - << ": D1 coloring produced invalid coloring (" << num_conflict - << " conflicts)"; + EXPECT_TRUE((num_conflict == conf)) << "Coloring algo " << (int)coloring_algorithm + << ": kk_is_d1_coloring_valid returned incorrect number of conflicts (" + << num_conflict << ", should be " << conf << ")"; + + EXPECT_TRUE((num_conflict == 0)) << "Coloring algo " << (int)coloring_algorithm + << ": D1 coloring produced invalid coloring (" << num_conflict << " conflicts)"; } // device::execution_space::finalize(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - graph##_##graph_color##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring(50000, 50000 * 30, 200, \ - 10); \ - test_coloring(50000, 50000 * 30, 100, \ - 10); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring(50000, 50000 * 30, 200, 10); \ + test_coloring(50000, 50000 * 30, 100, 10); \ } -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif diff --git a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp index 7bd3c4cd40..87771de84f 100644 --- a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp +++ b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp @@ -32,11 +32,8 @@ using namespace KokkosGraph::Experimental; namespace Test { template -int run_graphcolor_deter( - crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, - size_t &num_colors, - typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type - &vertex_colors) { +int run_graphcolor_deter(crsMat_t input_mat, ColoringAlgorithm coloring_algorithm, size_t &num_colors, + typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type &vertex_colors) { typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -46,9 +43,8 @@ int run_graphcolor_deter( typedef typename lno_nnz_view_t::value_type lno_t; typedef typename scalar_view_t::value_type scalar_t; - typedef KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> + typedef KokkosKernelsHandle KernelHandle; KernelHandle kh; @@ -60,9 +56,8 @@ int run_graphcolor_deter( const size_t num_rows_1 = input_mat.numRows(); const size_t num_cols_1 = input_mat.numCols(); - graph_color( - &kh, num_rows_1, num_cols_1, input_mat.graph.row_map, - input_mat.graph.entries); + graph_color(&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, + input_mat.graph.entries); num_colors = kh.get_graph_coloring_handle()->get_num_colors(); vertex_colors = kh.get_graph_coloring_handle()->get_vertex_colors(); @@ -72,13 +67,10 @@ int run_graphcolor_deter( } // namespace Test -template +template void test_coloring_deterministic(lno_t numRows, size_type nnz) { using namespace Test; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + typedef typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -89,11 +81,9 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { lno_t numCols = numRows; typename lno_view_t::non_const_type xadj("xadj", numRows + 1); - typename lno_view_t::non_const_type::HostMirror h_xadj = - Kokkos::create_mirror_view(xadj); + typename lno_view_t::non_const_type::HostMirror h_xadj = Kokkos::create_mirror_view(xadj); typename lno_nnz_view_t::non_const_type adj("adj", nnz); - typename lno_nnz_view_t::non_const_type::HostMirror h_adj = - Kokkos::create_mirror_view(adj); + typename lno_nnz_view_t::non_const_type::HostMirror h_adj = Kokkos::create_mirror_view(adj); // Fill up the rowPtr array h_xadj(0) = 0; @@ -211,18 +201,15 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { size_t num_colors; Kokkos::Timer timer1; - int res = run_graphcolor_deter( - input_mat, coloring_algorithm, num_colors, vector_colors); + int res = run_graphcolor_deter(input_mat, coloring_algorithm, num_colors, vector_colors); EXPECT_TRUE((res == 0)); EXPECT_TRUE((num_colors == 2)); - size_type num_conflict = 0; - typename color_view_t::HostMirror h_vector_colors = - Kokkos::create_mirror_view(vector_colors); + size_type num_conflict = 0; + typename color_view_t::HostMirror h_vector_colors = Kokkos::create_mirror_view(vector_colors); Kokkos::deep_copy(h_vector_colors, vector_colors); - int exact_colors[18] = {2, 1, 2, 1, 1, 2, 1, 2, 2, - 1, 2, 1, 2, 1, 2, 1, 2, 1}; + int exact_colors[18] = {2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1}; for (lno_t vertexIdx = 0; vertexIdx < numRows; ++vertexIdx) { if (h_vector_colors(vertexIdx) != exact_colors[vertexIdx]) { @@ -235,39 +222,29 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_deterministic##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring_deterministic(18, 74); \ - test_coloring_deterministic(18, 74); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color_deterministic##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring_deterministic(18, 74); \ + test_coloring_deterministic(18, 74); \ } -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif diff --git a/graph/unit_test/Test_Graph_graph_color_distance2.hpp b/graph/unit_test/Test_Graph_graph_color_distance2.hpp index 44ddaed0bf..ac3bbb7a18 100644 --- a/graph/unit_test/Test_Graph_graph_color_distance2.hpp +++ b/graph/unit_test/Test_Graph_graph_color_distance2.hpp @@ -35,10 +35,8 @@ using namespace KokkosGraph::Experimental; namespace Test { // Verify that a distance-2 coloring is correct (all views must be hostspace) -template -bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, - const entries_t& entries, const colors_t& colors) { +template +bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, const entries_t& entries, const colors_t& colors) { // Just do the simplest possible neighbors-of-neighbors loop to find conflicts for (lno_t v = 0; v < numVerts; v++) { if (colors(v) == 0) { @@ -52,8 +50,7 @@ bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, if (nei1 < numVerts && nei1 != v) { // check for dist-1 conflict if (colors(v) == colors(nei1)) { - std::cout << "Dist-1 conflict between " << v << " and " << nei1 - << '\n'; + std::cout << "Dist-1 conflict between " << v << " and " << nei1 << '\n'; return false; } // iterate over dist-2 neighbors @@ -63,8 +60,7 @@ bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, lno_t nei2 = entries(j); if (nei2 < numVerts && nei2 != v) { if (colors(v) == colors(nei2)) { - std::cout << "Dist-2 conflict between " << v << " and " << nei2 - << '\n'; + std::cout << "Dist-2 conflict between " << v << " and " << nei2 << '\n'; return false; } } @@ -75,14 +71,9 @@ bool verifyD2Coloring(lno_t numVerts, const rowmap_t& rowmap, return true; } -template -bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, - const rowmap_t& rowmap, - const entries_t& entries, - const rowmap_t& t_rowmap, - const entries_t& t_entries, - const colors_t& colors) { +template +bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, const rowmap_t& rowmap, const entries_t& entries, + const rowmap_t& t_rowmap, const entries_t& t_entries, const colors_t& colors) { // Just do the simplest possible neighbors-of-neighbors loop to find conflicts for (lno_t v = 0; v < numRows; v++) { if (colors(v) == 0) { @@ -101,8 +92,7 @@ bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, lno_t nei2 = t_entries(j); if (nei2 < numRows && nei2 != v) { if (colors(v) == colors(nei2)) { - std::cout << "Hyperedge conflict between " << v << " and " << nei2 - << '\n'; + std::cout << "Hyperedge conflict between " << v << " and " << nei2 << '\n'; return false; } } @@ -114,256 +104,189 @@ bool verifyBipartitePartialColoring(lno_t numRows, lno_t numCols, } } // namespace Test -template -void test_dist2_coloring(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_dist2_coloring(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using KernelHandle = - KokkosKernelsHandle; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using KernelHandle = KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); - std::vector algos = { - COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, - COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + std::vector algos = {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, + COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; for (auto algo : algos) { KernelHandle kh; kh.create_distance2_graph_coloring_handle(algo); // Compute the Distance-2 graph coloring. - graph_color_distance2( - &kh, numVerts, symRowmap, symEntries); + graph_color_distance2(&kh, numVerts, symRowmap, symEntries); execution_space().fence(); auto coloring_handle = kh.get_distance2_graph_coloring_handle(); auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); + auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); + auto numColors = coloring_handle->get_num_colors(); EXPECT_LE(numColors, numVerts); bool success = - Test::verifyD2Coloring( + Test::verifyD2Coloring( numVerts, rowmapHost, entriesHost, colorsHost); - EXPECT_TRUE(success) << "Dist-2: algorithm " - << coloring_handle->getD2AlgorithmName() + EXPECT_TRUE(success) << "Dist-2: algorithm " << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; kh.destroy_distance2_graph_coloring_handle(); } } -template -void test_bipartite_symmetric(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_bipartite_symmetric(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using KernelHandle = - KokkosKernelsHandle; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using KernelHandle = KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); - std::vector algos = { - COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, - COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + std::vector algos = {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, + COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; for (auto algo : algos) { KernelHandle kh; kh.create_distance2_graph_coloring_handle(algo); // Compute the Distance-2 graph coloring. - bipartite_color_rows( - &kh, numVerts, numVerts, symRowmap, symEntries, true); + bipartite_color_rows(&kh, numVerts, numVerts, symRowmap, symEntries, true); execution_space().fence(); auto coloring_handle = kh.get_distance2_graph_coloring_handle(); auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); + auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); + auto numColors = coloring_handle->get_num_colors(); EXPECT_LE(numColors, numVerts); - bool success = Test::verifyBipartitePartialColoring< - lno_t, size_type, decltype(rowmapHost), decltype(entriesHost), - decltype(colorsHost)>(numVerts, numVerts, rowmapHost, entriesHost, - rowmapHost, entriesHost, colorsHost); - EXPECT_TRUE(success) << "Dist-2: algorithm " - << coloring_handle->getD2AlgorithmName() + bool success = Test::verifyBipartitePartialColoring( + numVerts, numVerts, rowmapHost, entriesHost, rowmapHost, entriesHost, colorsHost); + EXPECT_TRUE(success) << "Dist-2: algorithm " << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; kh.destroy_distance2_graph_coloring_handle(); } } -template -void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, - lno_t bandwidth, lno_t row_size_variance, bool colorRows) { +template +void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, lno_t bandwidth, lno_t row_size_variance, + bool colorRows) { using execution_space = typename device::execution_space; using memory_space = typename device::memory_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using rowmap_t = typename graph_type::row_map_type::non_const_type; - using entries_t = typename graph_type::entries_type::non_const_type; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using KernelHandle = - KokkosKernelsHandle; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using rowmap_t = typename graph_type::row_map_type::non_const_type; + using entries_t = typename graph_type::entries_type::non_const_type; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using KernelHandle = KokkosKernelsHandle; // Generate graph - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, row_size_variance, bandwidth); - auto G = A.graph; + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, row_size_variance, bandwidth); + auto G = A.graph; rowmap_t t_rowmap("rowmap^T", numCols + 1); entries_t t_entries("entries^T", G.entries.extent(0)); - KokkosSparse::Impl::transpose_graph( + KokkosSparse::Impl::transpose_graph( numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); // TODO: remove me, shouldn't be needed even with UVM execution_space().fence(); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.row_map); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.entries); - auto t_rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_rowmap); - auto t_entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_entries); - std::vector algos = { - COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, - COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.row_map); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), G.entries); + auto t_rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_rowmap); + auto t_entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), t_entries); + std::vector algos = {COLORING_D2_DEFAULT, COLORING_D2_SERIAL, COLORING_D2_VB, + COLORING_D2_VB_BIT, COLORING_D2_VB_BIT_EF, COLORING_D2_NB_BIT}; for (auto algo : algos) { KernelHandle kh; kh.create_distance2_graph_coloring_handle(algo); // Compute the one-sided bipartite coloring. if (colorRows) { - bipartite_color_rows( - &kh, numRows, numCols, G.row_map, G.entries); + bipartite_color_rows(&kh, numRows, numCols, G.row_map, G.entries); } else { - bipartite_color_columns( - &kh, numRows, numCols, G.row_map, G.entries); + bipartite_color_columns(&kh, numRows, numCols, G.row_map, G.entries); } execution_space().fence(); auto coloring_handle = kh.get_distance2_graph_coloring_handle(); auto colors = coloring_handle->get_vertex_colors(); - auto colorsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); - auto numColors = coloring_handle->get_num_colors(); + auto colorsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colors); + auto numColors = coloring_handle->get_num_colors(); bool success; if (colorRows) { EXPECT_LE(numColors, numRows); - success = Test::verifyBipartitePartialColoring< - lno_t, size_type, decltype(rowmapHost), decltype(entriesHost), - decltype(colorsHost)>(numRows, numCols, rowmapHost, entriesHost, - t_rowmapHost, t_entriesHost, colorsHost); + success = Test::verifyBipartitePartialColoring(numRows, numCols, rowmapHost, entriesHost, + t_rowmapHost, t_entriesHost, colorsHost); } else { EXPECT_LE(numColors, numCols); - success = Test::verifyBipartitePartialColoring< - lno_t, size_type, decltype(rowmapHost), decltype(entriesHost), - decltype(colorsHost)>(numCols, numRows, t_rowmapHost, t_entriesHost, - rowmapHost, entriesHost, colorsHost); + success = Test::verifyBipartitePartialColoring( + numCols, numRows, t_rowmapHost, t_entriesHost, rowmapHost, entriesHost, colorsHost); } - EXPECT_TRUE(success) << "Bipartite " << (colorRows ? "row" : "column") - << " coloring: algorithm " - << coloring_handle->getD2AlgorithmName() - << " produced invalid coloring"; + EXPECT_TRUE(success) << "Bipartite " << (colorRows ? "row" : "column") << " coloring: algorithm " + << coloring_handle->getD2AlgorithmName() << " produced invalid coloring"; kh.destroy_distance2_graph_coloring_handle(); } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_dist2_coloring(5000, 5000 * 20, \ - 1000, 10); \ - test_dist2_coloring(50, 50 * 10, 40, 10); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_bipartite_sym##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_bipartite_symmetric(50, 50 * 5, 30, \ - 1); \ - test_bipartite_symmetric(2000, 2000 * 20, \ - 800, 10); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_bipartite_row##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_bipartite(2000, 4000, 3000 * 20, \ - 800, 10, true); \ - test_bipartite(4000, 2000, 3000 * 20, \ - 800, 10, true); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_color_bipartite_col##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_bipartite(2000, 4000, 3000 * 20, \ - 800, 10, false); \ - test_bipartite(4000, 2000, 3000 * 20, \ - 800, 10, false); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color_distance2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_dist2_coloring(5000, 5000 * 20, 1000, 10); \ + test_dist2_coloring(50, 50 * 10, 40, 10); \ + } \ + TEST_F(TestCategory, graph##_##graph_color_bipartite_sym##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_bipartite_symmetric(50, 50 * 5, 30, 1); \ + test_bipartite_symmetric(2000, 2000 * 20, 800, 10); \ + } \ + TEST_F(TestCategory, graph##_##graph_color_bipartite_row##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_bipartite(2000, 4000, 3000 * 20, 800, 10, true); \ + test_bipartite(4000, 2000, 3000 * 20, 800, 10, true); \ + } \ + TEST_F(TestCategory, graph##_##graph_color_bipartite_col##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_bipartite(2000, 4000, 3000 * 20, 800, 10, false); \ + test_bipartite(4000, 2000, 3000 * 20, 800, 10, false); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_mis2.hpp b/graph/unit_test/Test_Graph_mis2.hpp index c6fb7562e7..cd96badd44 100644 --- a/graph/unit_test/Test_Graph_mis2.hpp +++ b/graph/unit_test/Test_Graph_mis2.hpp @@ -34,10 +34,8 @@ enum CoarseningType { PHASE2, NO_PHASE2 }; namespace Test { -template -bool verifyD2MIS(lno_t numVerts, const rowmap_t& rowmap, - const entries_t& entries, const mis_t& misArray) { +template +bool verifyD2MIS(lno_t numVerts, const rowmap_t& rowmap, const entries_t& entries, const mis_t& misArray) { // set a std::set of the mis, for fast membership test std::set mis; for (size_t i = 0; i < misArray.extent(0); i++) mis.insert(misArray(i)); @@ -82,74 +80,58 @@ bool verifyD2MIS(lno_t numVerts, const rowmap_t& rowmap, } } // namespace Test -template -void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); // For each algorithm, compute and verify the MIS std::vector algos = {MIS2_FAST, MIS2_QUALITY}; for (auto algo : algos) { - auto mis = KokkosGraph::graph_d2_mis( - symRowmap, symEntries, algo); - auto misHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); - bool success = Test::verifyD2MIS( + auto mis = KokkosGraph::graph_d2_mis(symRowmap, symEntries, algo); + auto misHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), mis); + bool success = Test::verifyD2MIS( numVerts, rowmapHost, entriesHost, misHost); - EXPECT_TRUE(success) << "Dist-2 MIS (algo " << (int)algo - << ") produced invalid set."; + EXPECT_TRUE(success) << "Dist-2 MIS (algo " << (int)algo << ") produced invalid set."; } } -template -void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { +template +void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using execution_space = typename device::execution_space; - using crsMat = - KokkosSparse::CrsMatrix; - using graph_type = typename crsMat::StaticCrsGraphType; - using c_rowmap_t = typename graph_type::row_map_type; - using c_entries_t = typename graph_type::entries_type; - using rowmap_t = typename c_rowmap_t::non_const_type; - using entries_t = typename c_entries_t::non_const_type; - using labels_t = entries_t; + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using labels_t = entries_t; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numVerts, numVerts, nnz, row_size_variance, bandwidth); + crsMat A = + KokkosSparse::Impl::kk_generate_sparse_matrix(numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph rowmap_t symRowmap; entries_t symEntries; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< - c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap( numVerts, G.row_map, G.entries, symRowmap, symEntries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symRowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), symEntries); // For each algorithm, compute and verify the MIS std::vector algos = {PHASE2, NO_PHASE2}; for (auto algo : algos) { @@ -157,46 +139,34 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, labels_t labels; switch (algo) { case NO_PHASE2: - labels = KokkosGraph::graph_mis2_coarsen( - symRowmap, symEntries, numClusters); + labels = KokkosGraph::graph_mis2_coarsen(symRowmap, symEntries, numClusters); break; case PHASE2: - labels = KokkosGraph::graph_mis2_aggregate( - symRowmap, symEntries, numClusters); + labels = KokkosGraph::graph_mis2_aggregate(symRowmap, symEntries, numClusters); } - auto labelsHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), labels); + auto labelsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), labels); // Not a strong test, but sanity check the number of clusters returned EXPECT_TRUE(numClusters >= 1 && numClusters <= numVerts); // Check that every label is in the range [0, numClusters) - for (lno_t i = 0; i < numVerts; i++) - EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + for (lno_t i = 0; i < numVerts; i++) EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); // Test explicit coarsening given the labels, with and without compressing // the result rowmap_t coarseRowmapNC, coarseRowmapC; entries_t coarseEntriesNC, coarseEntriesC; - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( - symRowmap, symEntries, labels, numClusters, coarseRowmapNC, - coarseEntriesNC, false); - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( - symRowmap, symEntries, labels, numClusters, coarseRowmapC, - coarseEntriesC, true); + KokkosGraph::Experimental::graph_explicit_coarsen( + symRowmap, symEntries, labels, numClusters, coarseRowmapNC, coarseEntriesNC, false); + KokkosGraph::Experimental::graph_explicit_coarsen( + symRowmap, symEntries, labels, numClusters, coarseRowmapC, coarseEntriesC, true); EXPECT_EQ(coarseRowmapC.extent(0), numClusters + 1); EXPECT_EQ(coarseRowmapNC.extent(0), numClusters + 1); // Check that coarse graph doesn't have more edges than fine graph EXPECT_LE(coarseEntriesC.extent(0), symEntries.extent(0)); EXPECT_LE(coarseEntriesNC.extent(0), symEntries.extent(0)); // Verify compression is working. - auto hostRowmapNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarseRowmapNC); - auto hostEntriesNC = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), coarseEntriesNC); - auto hostRowmapC = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapC); - auto hostEntriesC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - coarseEntriesC); + auto hostRowmapNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapNC); + auto hostEntriesNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesNC); + auto hostRowmapC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapC); + auto hostEntriesC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesC); for (lno_t i = 0; i < numClusters; i++) { // std::set maintains uniqueness as well as ascending order of elements. // So it should exactly match the entries in the compressed version. @@ -215,11 +185,9 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, } } -template +template void test_mis2_coarsening_zero_rows() { - using crsMat = - KokkosSparse::CrsMatrix; + using crsMat = KokkosSparse::CrsMatrix; using graph_type = typename crsMat::StaticCrsGraphType; using c_rowmap_t = typename graph_type::row_map_type; using c_entries_t = typename graph_type::entries_type; @@ -230,72 +198,55 @@ void test_mis2_coarsening_zero_rows() { // note: MIS2 coarsening first calls MIS2 on the fine graph, so this covers // the zero-row case for MIS2 alone. lno_t numClusters; - auto labels = KokkosGraph::graph_mis2_coarsen( - fineRowmap, fineEntries, numClusters); + auto labels = KokkosGraph::graph_mis2_coarsen(fineRowmap, fineEntries, numClusters); EXPECT_EQ(numClusters, 0); EXPECT_EQ(labels.extent(0), 0); // coarsen, should also produce a graph with 0 rows/entries rowmap_t coarseRowmap; entries_t coarseEntries; - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( + KokkosGraph::Experimental::graph_explicit_coarsen( fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, false); EXPECT_LE(coarseRowmap.extent(0), 1); EXPECT_EQ(coarseEntries.extent(0), 0); - KokkosGraph::Experimental::graph_explicit_coarsen< - device, rowmap_t, entries_t, entries_t, rowmap_t, entries_t>( + KokkosGraph::Experimental::graph_explicit_coarsen( fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, true); EXPECT_LE(coarseRowmap.extent(0), 1); EXPECT_EQ(coarseEntries.extent(0), 0); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_mis2(5000, 5000 * 20, 1000, 10); \ - test_mis2(50, 50 * 10, 40, 10); \ - test_mis2(5, 5 * 3, 5, 0); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_mis2_coarsening(5000, 5000 * 200, \ - 2000, 10); \ - test_mis2_coarsening(5000, 5000 * 20, \ - 1000, 10); \ - test_mis2_coarsening(50, 50 * 10, 40, \ - 10); \ - test_mis2_coarsening(5, 5 * 3, 5, 0); \ - test_mis2_coarsening_zero_rows(); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_mis2(5000, 5000 * 20, 1000, 10); \ + test_mis2(50, 50 * 10, 40, 10); \ + test_mis2(5, 5 * 3, 5, 0); \ + } \ + TEST_F(TestCategory, graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_mis2_coarsening(5000, 5000 * 200, 2000, 10); \ + test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ + test_mis2_coarsening(50, 50 * 10, 40, 10); \ + test_mis2_coarsening(5, 5 * 3, 5, 0); \ + test_mis2_coarsening_zero_rows(); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, TestDevice) #endif #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index a6d165d8c3..0a9543367a 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -26,13 +26,10 @@ // Generates a graph from 3D 7-pt stencil. Slices grid into 2 connected // components near the middle of X dimension. template -void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, - int gridY, int gridZ) { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - auto getVertexID = [=](lno_t x, lno_t y, lno_t z) -> lno_t { - return x + y * gridX + z * gridX * gridY; - }; +void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, int gridY, int gridZ) { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + auto getVertexID = [=](lno_t x, lno_t y, lno_t z) -> lno_t { return x + y * gridX + z * gridX * gridY; }; lno_t numVertices = gridX * gridY * gridZ; // Generate the graph on host (use std::vector to not need to know // how many entries ahead of time) @@ -44,10 +41,8 @@ void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, for (lno_t j = 0; j < gridY; j++) { for (lno_t i = 0; i < gridX; i++) { lno_t v = getVertexID(i, j, k); - if (i != 0 && i != xslice + 1) - entries.push_back(getVertexID(i - 1, j, k)); - if (i != gridX - 1 && i != xslice) - entries.push_back(getVertexID(i + 1, j, k)); + if (i != 0 && i != xslice + 1) entries.push_back(getVertexID(i - 1, j, k)); + if (i != gridX - 1 && i != xslice) entries.push_back(getVertexID(i + 1, j, k)); if (j != 0) entries.push_back(getVertexID(i, j - 1, k)); if (j != gridY - 1) entries.push_back(getVertexID(i, j + 1, k)); if (k != 0) entries.push_back(getVertexID(i, j, k - 1)); @@ -59,26 +54,20 @@ void generate7pt(rowmap_t& rowmapView, entries_t& entriesView, int gridX, size_type numEdges = entries.size(); // Now that the graph is formed, copy rowmap and entries to Kokkos::Views in // device memory The nonowning host views just alias the std::vectors. - Kokkos::View> - rowmapHost(rowmap.data(), numVertices + 1); - Kokkos::View> - entriesHost(entries.data(), numEdges); + Kokkos::View> rowmapHost(rowmap.data(), + numVertices + 1); + Kokkos::View> entriesHost(entries.data(), + numEdges); // Allocate owning views on device with the correct size. - rowmapView = - rowmap_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), - numVertices + 1); - entriesView = entries_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Colinds"), numEdges); + rowmapView = rowmap_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), numVertices + 1); + entriesView = entries_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Colinds"), numEdges); // Copy the graph from host to device Kokkos::deep_copy(rowmapView, rowmapHost); Kokkos::deep_copy(entriesView, entriesHost); } template -int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, - const labels_t& invPerm, const labels_t& perm) { +int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; lno_t numVerts = std::max(1, rowmap.extent_int(0)) - 1; @@ -98,19 +87,14 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, } template -void test_rcm(const rowmap_t& rowmap, const entries_t& entries, - bool expectBandwidthReduced) { - using lno_t = typename entries_t::non_const_value_type; - auto rcm = KokkosGraph::Experimental::graph_rcm( - rowmap, entries); - auto rowmapHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); - auto entriesHost = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); - auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); - lno_t numVerts = std::max(rowmap.extent_int(0), 1) - 1; - decltype(rcmHost) rcmPermHost( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCMPerm"), numVerts); +void test_rcm(const rowmap_t& rowmap, const entries_t& entries, bool expectBandwidthReduced) { + using lno_t = typename entries_t::non_const_value_type; + auto rcm = KokkosGraph::Experimental::graph_rcm(rowmap, entries); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + lno_t numVerts = std::max(rowmap.extent_int(0), 1) - 1; + decltype(rcmHost) rcmPermHost(Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCMPerm"), numVerts); for (lno_t i = 0; i < numVerts; i++) rcmPermHost(rcmHost(i)) = i; // make sure each row index shows up exactly once { @@ -124,20 +108,18 @@ void test_rcm(const rowmap_t& rowmap, const entries_t& entries, for (lno_t i = 0; i < numVerts; i++) ASSERT_EQ(counts[i], 1); } if (expectBandwidthReduced) { - Kokkos::View identityOrder( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); + Kokkos::View identityOrder(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), + numVerts); for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; - size_t origBW = - maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); - size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + size_t origBW = maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); EXPECT_LE(rcmBW, origBW); } } template void test_rcm_zerorows() { - using graph_t = - Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -146,10 +128,8 @@ void test_rcm_zerorows() { } template -void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, - bool expectBandwidthReduced) { - using graph_t = - Kokkos::StaticCrsGraph; +void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, bool expectBandwidthReduced) { + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -160,8 +140,7 @@ void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, template void test_rcm_4clique() { - using graph_t = - Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap("rowmap", 5); @@ -177,20 +156,17 @@ void test_rcm_4clique() { template void test_rcm_multiple_components() { - using graph_t = - Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; // Generate a single 3D grid first rowmap_t rowmap_cube; entries_t entries_cube; generate7pt(rowmap_cube, entries_cube, 7, 7, 7); - auto rowmap_cube_host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap_cube); - auto entries_cube_host = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries_cube); - lno_t nv_cube = 7 * 7 * 7; - lno_t ne_cube = entries_cube.extent(0); + auto rowmap_cube_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap_cube); + auto entries_cube_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries_cube); + lno_t nv_cube = 7 * 7 * 7; + lno_t ne_cube = entries_cube.extent(0); // Now replicate the graph twice, so there are 2 disconnected copies of the // cube rowmap_t rowmap("rowmap", nv_cube * 2 + 1); @@ -214,55 +190,41 @@ void test_rcm_multiple_components() { test_rcm(rowmap, entries, true); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - graph##_##rcm_zerorows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_zerorows(); \ - } \ - TEST_F(TestCategory, \ - graph##_##rcm_7pt##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_7pt(1, 1, 1, false); \ - test_rcm_7pt(2, 1, 1, false); \ - test_rcm_7pt(6, 3, 3, true); \ - test_rcm_7pt(20, 20, 20, true); \ - test_rcm_7pt(100, 100, 1, true); \ - } \ - TEST_F(TestCategory, \ - graph##_##rcm_4clique##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_4clique(); \ - } \ - TEST_F( \ - TestCategory, \ - graph##_##rcm_multiple_components##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm_multiple_components(); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##rcm_zerorows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_zerorows(); \ + } \ + TEST_F(TestCategory, graph##_##rcm_7pt##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_7pt(1, 1, 1, false); \ + test_rcm_7pt(2, 1, 1, false); \ + test_rcm_7pt(6, 3, 3, true); \ + test_rcm_7pt(20, 20, 20, true); \ + test_rcm_7pt(100, 100, 1, true); \ + } \ + TEST_F(TestCategory, graph##_##rcm_4clique##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_4clique(); \ + } \ + TEST_F(TestCategory, graph##_##rcm_multiple_components##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_multiple_components(); \ } -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, int, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int, size_t, TestDevice) #endif -#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif diff --git a/lapack/impl/KokkosLapack_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp index 97d74280ff..60a69e72b3 100644 --- a/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -42,21 +42,17 @@ struct gesv_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct gesv_eti_spec_avail< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct gesv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -70,23 +66,19 @@ namespace Impl { /// \brief Implementation of KokkosLapack::gesv. template ::value, - bool eti_spec_avail = - gesv_eti_spec_avail::value> + bool tpl_spec_avail = gesv_tpl_spec_avail::value, + bool eti_spec_avail = gesv_eti_spec_avail::value> struct GESV { - static void gesv(const ExecutionSpace &space, const AMatrix &A, const BXMV &B, - const IPIVV &IPIV); + static void gesv(const ExecutionSpace &space, const AMatrix &A, const BXMV &B, const IPIVV &IPIV); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of gesv for multi vectors. // Unification layer template -struct GESV { - static void gesv(const ExecutionSpace & /* space */, const AMatrix & /* A */, - const BXMV & /* B */, const IPIVV & /* IPIV */) { +struct GESV { + static void gesv(const ExecutionSpace & /* space */, const AMatrix & /* A */, const BXMV & /* B */, + const IPIVV & /* IPIV */) { // NOTE: Might add the implementation of KokkosLapack::gesv later throw std::runtime_error( "No fallback implementation of GESV (general LU factorization & solve) " @@ -105,36 +97,26 @@ struct GESV, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct GESV< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; -#define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct GESV< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct GESV< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/lapack/impl/KokkosLapack_svd_spec.hpp b/lapack/impl/KokkosLapack_svd_spec.hpp index fc0a34f790..b0dfe3d091 100644 --- a/lapack/impl/KokkosLapack_svd_spec.hpp +++ b/lapack/impl/KokkosLapack_svd_spec.hpp @@ -28,8 +28,7 @@ namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct svd_eti_spec_avail { enum : bool { value = false }; }; @@ -43,24 +42,19 @@ struct svd_eti_spec_avail { // We may spread out definitions (see _INST macro below) across one or // more .cpp files. // -#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct svd_eti_spec_avail< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct svd_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -73,29 +67,21 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosLapack::svd. -template ::value, - bool eti_spec_avail = svd_eti_spec_avail< - ExecutionSpace, AMatrix, SVector, UMatrix, VMatrix>::value> +template ::value, + bool eti_spec_avail = svd_eti_spec_avail::value> struct SVD { - static void svd(const ExecutionSpace &space, const char jobu[], - const char jobvt[], const AMatrix &A, const SVector &S, - const UMatrix &U, const VMatrix &Vt); + static void svd(const ExecutionSpace &space, const char jobu[], const char jobvt[], const AMatrix &A, + const SVector &S, const UMatrix &U, const VMatrix &Vt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of svd // Unification layer -template -struct SVD { - static void svd(const ExecutionSpace & /* space */, const char * /* jobu */, - const char * /* jobvt */, const AMatrix & /* A */, - const SVector & /* S */, const UMatrix & /* U */, - const VMatrix & /* Vt */) { +template +struct SVD { + static void svd(const ExecutionSpace & /* space */, const char * /* jobu */, const char * /* jobvt */, + const AMatrix & /* A */, const SVector & /* S */, const UMatrix & /* U */, const VMatrix & /* Vt */) { // NOTE: Might add the implementation of KokkosLapack::svd later throw std::runtime_error( "No fallback implementation of SVD (singular value decomposition) " @@ -115,40 +101,30 @@ struct SVD, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_SVD_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; -#define KOKKOSLAPACK_SVD_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct SVD< \ - EXEC_SPACE_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSLAPACK_SVD_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/lapack/impl/KokkosLapack_trtri_impl.hpp b/lapack/impl/KokkosLapack_trtri_impl.hpp index 9f52c2d412..5ba6f80eec 100644 --- a/lapack/impl/KokkosLapack_trtri_impl.hpp +++ b/lapack/impl/KokkosLapack_trtri_impl.hpp @@ -31,8 +31,7 @@ namespace KokkosLapack { namespace Impl { template -void SerialTrtri_Invoke(const RViewType &R, const char uplo[], - const char diag[], const AViewType &A) { +void SerialTrtri_Invoke(const RViewType &R, const char uplo[], const char diag[], const AViewType &A) { using KokkosBatched::Algo; using KokkosBatched::Diag; using KokkosBatched::SerialTrtriInternalLower; @@ -43,24 +42,20 @@ void SerialTrtri_Invoke(const RViewType &R, const char uplo[], //// Lower //// if (__uplo == 'l') { if (__diag == 'u') { - R() = SerialTrtriInternalLower::invoke( - Diag::Unit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalLower::invoke(Diag::Unit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } else { - R() = SerialTrtriInternalLower::invoke( - Diag::NonUnit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalLower::invoke(Diag::NonUnit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } } else { //// Upper //// if (__diag == 'u') { - R() = SerialTrtriInternalUpper::invoke( - Diag::Unit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalUpper::invoke(Diag::Unit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } else { - R() = SerialTrtriInternalUpper::invoke( - Diag::NonUnit::use_unit_diag, A.extent(0), A.extent(1), A.data(), - A.stride(0), A.stride(1)); + R() = SerialTrtriInternalUpper::invoke(Diag::NonUnit::use_unit_diag, A.extent(0), + A.extent(1), A.data(), A.stride(0), A.stride(1)); } } } diff --git a/lapack/impl/KokkosLapack_trtri_spec.hpp b/lapack/impl/KokkosLapack_trtri_spec.hpp index a17184dc41..ef458f7e57 100644 --- a/lapack/impl/KokkosLapack_trtri_spec.hpp +++ b/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -37,15 +37,13 @@ struct trtri_eti_spec_avail { // This Macros provides the ETI specialization of trtri, currently not // available. // -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct trtri_eti_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct trtri_eti_spec_avail< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -60,33 +58,28 @@ namespace Impl { // // Unification layer -template ::value, +template ::value, bool eti_spec_avail = trtri_eti_spec_avail::value> struct TRTRI { - static void trtri(const RVIT& R, const char uplo[], const char diag[], - const AVIT& A); + static void trtri(const RVIT& R, const char uplo[], const char diag[], const AVIT& A); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template struct TRTRI { - static void trtri(const RVIT& R, const char uplo[], const char diag[], - const AVIT& A) { + static void trtri(const RVIT& R, const char uplo[], const char diag[], const AVIT& A) { static_assert(Kokkos::is_view::value, "AVIT must be a Kokkos::View."); static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? "KokkosLapack::trtri[ETI]" - : "KokkosLapack::trtri[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosLapack::trtri[ETI]" + : "KokkosLapack::trtri[noETI]"); typename AVIT::HostMirror host_A = Kokkos::create_mirror_view(A); typename RVIT::HostMirror host_R = Kokkos::create_mirror_view(R); Kokkos::deep_copy(host_A, A); - SerialTrtri_Invoke( - R, uplo, diag, host_A); + SerialTrtri_Invoke(R, uplo, diag, host_A); Kokkos::deep_copy(A, host_A); @@ -106,22 +99,18 @@ struct TRTRI { // "extern template" skips the implicit instatiation step ensuring that the // callers code uses this explicit instantiation definition of TRTRI. // -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ - extern template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ + extern template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ - template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ + template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp index b66583bbdf..281d6a5651 100644 --- a/lapack/src/KokkosLapack_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -53,44 +53,29 @@ namespace KokkosLapack { /// used. /// template -void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, - const IPIVV& IPIV) { +void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { // NOTE: Currently, KokkosLapack::gesv only supports LAPACK, MAGMA and // rocSOLVER TPLs. // MAGMA/rocSOLVER TPL should be enabled to call the MAGMA/rocSOLVER GPU // interface for device views LAPACK TPL should be enabled to call the // LAPACK interface for host views - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) if constexpr (!std::is_same_v) { - static_assert( - Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); } #else - static_assert( - Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); #endif - static_assert(Kokkos::is_view::value, - "KokkosLapack::gesv: A must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::gesv: B must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::gesv: IPIV must be a Kokkos::View."); - static_assert(static_cast(AMatrix::rank) == 2, - "KokkosLapack::gesv: A must have rank 2."); - static_assert( - static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, - "KokkosLapack::gesv: B must have either rank 1 or rank 2."); - static_assert(static_cast(IPIVV::rank) == 1, - "KokkosLapack::gesv: IPIV must have rank 1."); + static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: A must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: B must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: IPIV must be a Kokkos::View."); + static_assert(static_cast(AMatrix::rank) == 2, "KokkosLapack::gesv: A must have rank 2."); + static_assert(static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, + "KokkosLapack::gesv: B must have either rank 1 or rank 2."); + static_assert(static_cast(IPIVV::rank) == 1, "KokkosLapack::gesv: IPIV must have rank 1."); int64_t IPIV0 = IPIV.extent(0); int64_t A0 = A.extent(0); @@ -98,8 +83,7 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, int64_t B0 = B.extent(0); // Check validity of pivot argument - bool valid_pivot = - (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); + bool valid_pivot = (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); if (!(valid_pivot)) { std::ostringstream os; os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " @@ -112,9 +96,8 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, // Check for no pivoting case. Only MAGMA supports no pivoting interface #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL - if ((!std::is_same::value) && - (IPIV0 == 0) && (IPIV.data() == nullptr)) { + if ((!std::is_same::value) && (IPIV0 == 0) && + (IPIV.data() == nullptr)) { std::ostringstream os; os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " << "LAPACK TPL does not support no pivoting."; @@ -136,22 +119,18 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, if ((A0 < A1) || (A0 != B0)) { std::ostringstream os; os << "KokkosLapack::gesv: Dimensions of A, and B do not match: " - << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) - << " x " << B.extent(1); + << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) << " x " << B.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - typedef Kokkos::View< - typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, - typename AMatrix::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > AMatrix_Internal; - typedef Kokkos::View > BXMV_Internal; - typedef Kokkos::View< - typename IPIVV::non_const_value_type*, typename IPIVV::array_layout, - typename IPIVV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > IPIVV_Internal; AMatrix_Internal A_i = A; // BXMV_Internal B_i = B; @@ -159,12 +138,12 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, if (BXMV::rank == 1) { auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); - KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, + IPIV_i); } else { // BXMV::rank == 2 auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); - KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, + IPIV_i); } } diff --git a/lapack/src/KokkosLapack_svd.hpp b/lapack/src/KokkosLapack_svd.hpp index 71ea7cc30f..c0c962fb19 100644 --- a/lapack/src/KokkosLapack_svd.hpp +++ b/lapack/src/KokkosLapack_svd.hpp @@ -58,36 +58,21 @@ namespace KokkosLapack { /// vectors of A. /// // clang-format on -template -void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], - const AMatrix& A, const SVector& S, const UMatrix& U, - const VMatrix& Vt) { - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert( - Kokkos::SpaceAccessibility::accessible); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: A must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: S must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: U must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosLapack::svd: Vt must be a Kokkos::View."); +template +void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::is_view::value, "KokkosLapack::svd: A must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::svd: S must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::svd: U must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "KokkosLapack::svd: Vt must be a Kokkos::View."); static_assert(AMatrix::rank() == 2, "KokkosLapack::svd: A must have rank 2."); static_assert(SVector::rank() == 1, "KokkosLapack::svd: S must have rank 1."); static_assert(UMatrix::rank() == 2, "KokkosLapack::svd: U must have rank 2."); - static_assert(VMatrix::rank() == 2, - "KokkosLapack::svd: Vt must have rank 2."); + static_assert(VMatrix::rank() == 2, "KokkosLapack::svd: Vt must have rank 2."); int64_t m = A.extent(0); int64_t n = A.extent(1); @@ -102,40 +87,32 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], // Check the jobu and jobvt control flags // The only valid options there are 'A', 'S', 'O' and 'N' - const bool is_jobu_invalid = - !((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || - (jobu[0] == 's') || (jobu[0] == 'O') || (jobu[0] == 'o') || - (jobu[0] == 'N') || (jobu[0] == 'n')); + const bool is_jobu_invalid = !((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || (jobu[0] == 's') || + (jobu[0] == 'O') || (jobu[0] == 'o') || (jobu[0] == 'N') || (jobu[0] == 'n')); - const bool is_jobvt_invalid = - !((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || - (jobvt[0] == 's') || (jobvt[0] == 'O') || (jobvt[0] == 'o') || - (jobvt[0] == 'N') || (jobvt[0] == 'n')); + const bool is_jobvt_invalid = !((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || (jobvt[0] == 's') || + (jobvt[0] == 'O') || (jobvt[0] == 'o') || (jobvt[0] == 'N') || (jobvt[0] == 'n')); if (is_jobu_invalid && is_jobvt_invalid) { std::ostringstream oss; oss << "KokkosLapack::svd: both jobu and jobvt are invalid!\n" - << "Possible values are A, S, O or N, submitted values are " << jobu[0] - << " and " << jobvt[0] << "\n"; + << "Possible values are A, S, O or N, submitted values are " << jobu[0] << " and " << jobvt[0] << "\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); } if (is_jobu_invalid) { std::ostringstream oss; oss << "KokkosLapack::svd: jobu is invalid!\n" - << "Possible values are A, S, O or N, submitted value is " << jobu[0] - << "\n"; + << "Possible values are A, S, O or N, submitted value is " << jobu[0] << "\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); } if (is_jobvt_invalid) { std::ostringstream oss; oss << "KokkosLapack::svd: jobvt is invalid!\n" - << "Possible values are A, S, O or N, submitted value is " << jobvt[0] - << "\n"; + << "Possible values are A, S, O or N, submitted value is " << jobvt[0] << "\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); } - if (((jobu[0] == 'O') || (jobu[0] == 'o')) && - ((jobvt[0] == 'O') || (jobvt[0] == 'o'))) { + if (((jobu[0] == 'O') || (jobu[0] == 'o')) && ((jobvt[0] == 'O') || (jobvt[0] == 'o'))) { std::ostringstream oss; oss << "KokkosLapack::svd: jobu and jobvt cannot be O at the same time!\n"; KokkosKernels::Impl::throw_runtime_exception(oss.str()); @@ -148,23 +125,20 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], std::ostringstream os; if (S.extent_int(0) != rankA) { is_extent_invalid = true; - os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " - << rankA << ".\n"; + os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " << rankA << ".\n"; } - if ((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || - (jobu[0] == 's')) { + if ((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || (jobu[0] == 's')) { if (U.extent_int(0) != m || U.extent_int(1) != m) { is_extent_invalid = true; - os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " - << U.extent(1) << ") instead of (" << m << ", " << m << ").\n"; + os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " << U.extent(1) << ") instead of (" << m << ", " + << m << ").\n"; } } - if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || - (jobvt[0] == 's')) { + if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || (jobvt[0] == 's')) { if (Vt.extent_int(0) != n || Vt.extent_int(1) != n) { is_extent_invalid = true; - os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " - << Vt.extent(1) << ") instead of (" << n << ", " << n << ").\n"; + os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " << Vt.extent(1) << ") instead of (" << n + << ", " << n << ").\n"; } } if (is_extent_invalid) { @@ -172,8 +146,7 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], } #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) - if (std::is_same_v && - (A.extent(0) < A.extent(1))) { + if (std::is_same_v && (A.extent(0) < A.extent(1))) { throw std::runtime_error( "CUSOLVER does not support SVD for matrices with more columns " "than rows, you can transpose you matrix first then compute " @@ -182,32 +155,25 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], } #endif - using AMatrix_Internal = Kokkos::View< - typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, - typename AMatrix::device_type, Kokkos::MemoryTraits>; + using AMatrix_Internal = Kokkos::View>; - using SVector_Internal = Kokkos::View< - typename SVector::non_const_value_type*, typename SVector::array_layout, - typename SVector::device_type, Kokkos::MemoryTraits>; + using SVector_Internal = Kokkos::View>; - using UMatrix_Internal = Kokkos::View< - typename UMatrix::non_const_value_type**, typename UMatrix::array_layout, - typename UMatrix::device_type, Kokkos::MemoryTraits>; + using UMatrix_Internal = Kokkos::View>; - using VMatrix_Internal = Kokkos::View< - typename VMatrix::non_const_value_type**, typename VMatrix::array_layout, - typename VMatrix::device_type, Kokkos::MemoryTraits>; + using VMatrix_Internal = Kokkos::View>; AMatrix_Internal A_i = A; SVector_Internal S_i = S; UMatrix_Internal U_i = U; VMatrix_Internal Vt_i = Vt; - KokkosLapack::Impl::SVD::svd(space, jobu, - jobvt, A_i, - S_i, U_i, - Vt_i); + KokkosLapack::Impl::SVD::svd( + space, jobu, jobvt, A_i, S_i, U_i, Vt_i); } // clang-format off @@ -235,8 +201,8 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], /// // clang-format on template -void svd(const char jobu[], const char jobvt[], const AMatrix& A, - const SVector& S, const UMatrix& U, const VMatrix& Vt) { +void svd(const char jobu[], const char jobvt[], const AMatrix& A, const SVector& S, const UMatrix& U, + const VMatrix& Vt) { typename AMatrix::execution_space space{}; svd(space, jobu, jobvt, A, S, U, Vt); } diff --git a/lapack/src/KokkosLapack_trtri.hpp b/lapack/src/KokkosLapack_trtri.hpp index 9a884f2303..cfe311f476 100644 --- a/lapack/src/KokkosLapack_trtri.hpp +++ b/lapack/src/KokkosLapack_trtri.hpp @@ -49,16 +49,12 @@ namespace KokkosLapack { // source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri template int trtri(const char uplo[], const char diag[], const AViewType& A) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); // Check validity of indicator argument - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || (uplo[0] == 'l'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || (diag[0] == 'n'); if (!valid_uplo) { std::ostringstream os; @@ -94,22 +90,17 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { } // Create A matrix view type alias - using AViewInternalType = - Kokkos::View >; + using AViewInternalType = Kokkos::View >; // This is the return value type and should always reside on host using RViewInternalType = - Kokkos::View >; + Kokkos::View >; int result; RViewInternalType R = RViewInternalType(&result); - KokkosLapack::Impl::TRTRI::trtri( - R, uplo, diag, A); + KokkosLapack::Impl::TRTRI::trtri(R, uplo, diag, A); return result; } diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 943d10d111..3ead12d5f4 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -24,8 +24,7 @@ namespace Impl { CudaLapackSingleton::CudaLapackSingleton() { cusolverStatus_t stat = cusolverDnCreate(&handle); - if (stat != CUSOLVER_STATUS_SUCCESS) - Kokkos::abort("CUSOLVER initialization failed\n"); + if (stat != CUSOLVER_STATUS_SUCCESS) Kokkos::abort("CUSOLVER initialization failed\n"); Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); } diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp index add0a802bd..3b60a0578b 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.cpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -29,39 +29,25 @@ extern "C" { /// Gesv /// -void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, - int*); -void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, - int*, int*); -void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, - std::complex*, int*, int*); -void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, - int*, std::complex*, int*, int*); +void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, int*); +void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, int*, int*); +void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); +void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); /// /// Gesvd /// -void F77_BLAS_MANGLE(sgesvd, SGESVD)(const char*, const char*, const int*, - const int*, float*, const int*, float*, - float*, const int*, float*, const int*, - float*, int*, int*); -void F77_BLAS_MANGLE(dgesvd, DGESVD)(const char*, const char*, const int*, - const int*, double*, const int*, double*, - double*, const int*, double*, const int*, - double*, int*, int*); -void F77_BLAS_MANGLE(cgesvd, CGESVD)(const char*, const char*, const int*, - const int*, std::complex*, - const int*, float*, std::complex*, - const int*, std::complex*, - const int*, std::complex*, int*, - float*, int*); -void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, - const int*, std::complex*, - const int*, double*, std::complex*, - const int*, std::complex*, - const int*, std::complex*, int*, - double*, int*); +void F77_BLAS_MANGLE(sgesvd, SGESVD)(const char*, const char*, const int*, const int*, float*, const int*, float*, + float*, const int*, float*, const int*, float*, int*, int*); +void F77_BLAS_MANGLE(dgesvd, DGESVD)(const char*, const char*, const int*, const int*, double*, const int*, double*, + double*, const int*, double*, const int*, double*, int*, int*); +void F77_BLAS_MANGLE(cgesvd, CGESVD)(const char*, const char*, const int*, const int*, std::complex*, const int*, + float*, std::complex*, const int*, std::complex*, const int*, + std::complex*, int*, float*, int*); +void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, const int*, std::complex*, + const int*, double*, std::complex*, const int*, std::complex*, + const int*, std::complex*, int*, double*, int*); /// /// Trtri @@ -74,14 +60,10 @@ void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, &diag, &n, a, &lda, &info); */ -void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, - const float*, int*, int*); -void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, - const double*, int*, int*); -void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); -void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); +void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, const float*, int*, int*); +void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, const double*, int*, int*); +void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, const std::complex*, int*, int*); +void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, const std::complex*, int*, int*); } #define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) @@ -107,22 +89,17 @@ namespace Impl { /// template <> -void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, - float* b, int ldb, int info) { +void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, float* b, int ldb, int info) { F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack::gesvd(const char jobu, const char jobvt, const int m, - const int n, float* a, const int lda, float* s, - float* u, const int ldu, float* vt, - const int ldvt, float* work, int lwork, +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, const int n, float* a, const int lda, + float* s, float* u, const int ldu, float* vt, const int ldvt, float* work, int lwork, float* /*rwork*/, int info) { - F77_FUNC_SGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, - &lwork, &info); + F77_FUNC_SGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, &info); } template <> -int HostLapack::trtri(const char uplo, const char diag, int n, - const float* a, int lda) { +int HostLapack::trtri(const char uplo, const char diag, int n, const float* a, int lda) { int info = 0; F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); return info; @@ -133,22 +110,17 @@ int HostLapack::trtri(const char uplo, const char diag, int n, /// template <> -void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, - double* b, int ldb, int info) { +void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, double* b, int ldb, int info) { F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack::gesvd(const char jobu, const char jobvt, const int m, - const int n, double* a, const int lda, double* s, - double* u, const int ldu, double* vt, - const int ldvt, double* work, int lwork, +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, const int n, double* a, const int lda, + double* s, double* u, const int ldu, double* vt, const int ldvt, double* work, int lwork, double* /*rwork*/, int info) { - F77_FUNC_DGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, - &lwork, &info); + F77_FUNC_DGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, &info); } template <> -int HostLapack::trtri(const char uplo, const char diag, int n, - const double* a, int lda) { +int HostLapack::trtri(const char uplo, const char diag, int n, const double* a, int lda) { int info = 0; F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); return info; @@ -159,24 +131,19 @@ int HostLapack::trtri(const char uplo, const char diag, int n, /// template <> -void HostLapack >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { +void HostLapack >::gesv(int n, int rhs, std::complex* a, int lda, int* ipiv, + std::complex* b, int ldb, int info) { F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack >::gesvd( - const char jobu, const char jobvt, const int m, const int n, - std::complex* a, const int lda, float* s, std::complex* u, - const int ldu, std::complex* vt, const int ldvt, - std::complex* work, int lwork, float* rwork, int info) { - F77_FUNC_CGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, - &lwork, rwork, &info); +void HostLapack >::gesvd(const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, float* s, std::complex* u, + const int ldu, std::complex* vt, const int ldvt, + std::complex* work, int lwork, float* rwork, int info) { + F77_FUNC_CGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, rwork, &info); } template <> -int HostLapack >::trtri(const char uplo, const char diag, - int n, const std::complex* a, +int HostLapack >::trtri(const char uplo, const char diag, int n, const std::complex* a, int lda) { int info = 0; F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); @@ -188,25 +155,20 @@ int HostLapack >::trtri(const char uplo, const char diag, /// template <> -void HostLapack >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { +void HostLapack >::gesv(int n, int rhs, std::complex* a, int lda, int* ipiv, + std::complex* b, int ldb, int info) { F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> -void HostLapack >::gesvd( - const char jobu, const char jobvt, const int m, const int n, - std::complex* a, const int lda, double* s, std::complex* u, - const int ldu, std::complex* vt, const int ldvt, - std::complex* work, int lwork, double* rwork, int info) { - F77_FUNC_ZGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, - &lwork, rwork, &info); +void HostLapack >::gesvd(const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, double* s, + std::complex* u, const int ldu, std::complex* vt, + const int ldvt, std::complex* work, int lwork, double* rwork, + int info) { + F77_FUNC_ZGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, rwork, &info); } template <> -int HostLapack >::trtri(const char uplo, const char diag, - int n, - const std::complex* a, +int HostLapack >::trtri(const char uplo, const char diag, int n, const std::complex* a, int lda) { int info = 0; F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); diff --git a/lapack/tpls/KokkosLapack_Host_tpl.hpp b/lapack/tpls/KokkosLapack_Host_tpl.hpp index 9eca83afea..092f9ac9f0 100644 --- a/lapack/tpls/KokkosLapack_Host_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.hpp @@ -30,17 +30,13 @@ namespace Impl { template struct HostLapack { - static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, - int info); + static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, int info); - static void gesvd(const char jobu, const char jobvt, const int m, const int n, - T *A, const int lda, - typename Kokkos::ArithTraits::mag_type *S, T *U, - const int ldu, T *Vt, const int ldvt, T *work, int lwork, - typename Kokkos::ArithTraits::mag_type *rwork, int info); + static void gesvd(const char jobu, const char jobvt, const int m, const int n, T *A, const int lda, + typename Kokkos::ArithTraits::mag_type *S, T *U, const int ldu, T *Vt, const int ldvt, T *work, + int lwork, typename Kokkos::ArithTraits::mag_type *rwork, int info); - static int trtri(const char uplo, const char diag, int n, const T *a, - int lda); + static int trtri(const char uplo, const char diag, int n, const T *a, int lda); }; } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_cusolver.hpp b/lapack/tpls/KokkosLapack_cusolver.hpp index 006fd68b6f..272fb8b3b8 100644 --- a/lapack/tpls/KokkosLapack_cusolver.hpp +++ b/lapack/tpls/KokkosLapack_cusolver.hpp @@ -34,8 +34,7 @@ struct CudaLapackSingleton { static CudaLapackSingleton& singleton(); }; -inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, - const char* name, const char* file, +inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, const char* name, const char* file, const int line) { std::ostringstream out; out << name << " error( "; @@ -48,21 +47,11 @@ inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, out << "CUSOLVER_STATUS_ALLOC_FAILED): you might tried to allocate too " "much memory"; break; - case CUSOLVER_STATUS_INVALID_VALUE: - out << "CUSOLVER_STATUS_INVALID_VALUE)"; - break; - case CUSOLVER_STATUS_ARCH_MISMATCH: - out << "CUSOLVER_STATUS_ARCH_MISMATCH)"; - break; - case CUSOLVER_STATUS_EXECUTION_FAILED: - out << "CUSOLVER_STATUS_EXECUTION_FAILED)"; - break; - case CUSOLVER_STATUS_INTERNAL_ERROR: - out << "CUSOLVER_STATUS_INTERNAL_ERROR)"; - break; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - out << "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED)"; - break; + case CUSOLVER_STATUS_INVALID_VALUE: out << "CUSOLVER_STATUS_INVALID_VALUE)"; break; + case CUSOLVER_STATUS_ARCH_MISMATCH: out << "CUSOLVER_STATUS_ARCH_MISMATCH)"; break; + case CUSOLVER_STATUS_EXECUTION_FAILED: out << "CUSOLVER_STATUS_EXECUTION_FAILED)"; break; + case CUSOLVER_STATUS_INTERNAL_ERROR: out << "CUSOLVER_STATUS_INTERNAL_ERROR)"; break; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: out << "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED)"; break; default: out << "unrecognized error code): this is bad!"; break; } if (file) { @@ -71,10 +60,8 @@ inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, throw std::runtime_error(out.str()); } -inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, - const char* name, - const char* file = nullptr, - const int line = 0) { +inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, const char* name, const char* file = nullptr, + const int line = 0) { if (CUSOLVER_STATUS_SUCCESS != cusolverStatus) { cusolver_internal_error_throw(cusolverStatus, name, file, line); } @@ -82,9 +69,8 @@ inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, // The macro below defines is the public interface for the safe cusolver calls. // The functions themselves are protected by impl namespace. -#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ - KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, \ - __LINE__) +#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, __LINE__) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index 9fbd299ca5..472b79ce85 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -28,27 +28,20 @@ struct gesv_tpl_spec_avail { // Generic Host side LAPACK (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct gesv_tpl_spec_avail< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct gesv_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #endif } // namespace Impl } // namespace KokkosLapack @@ -59,29 +52,23 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) } // namespace Impl } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA @@ -91,39 +78,28 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -136,28 +112,21 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index ca4b9e7abc..559f5d0509 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -23,14 +23,12 @@ template inline void gesv_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - printf("KokkosLapack::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", - typeid(AViewType).name(), typeid(BViewType).name(), - typeid(PViewType).name()); + printf("KokkosLapack::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", typeid(AViewType).name(), + typeid(BViewType).name(), typeid(PViewType).name()); #else #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK - printf("KokkosLapack::gesv<> TPL Lapack specialization for < %s , %s, %s >\n", - typeid(AViewType).name(), typeid(BViewType).name(), - typeid(PViewType).name()); + printf("KokkosLapack::gesv<> TPL Lapack specialization for < %s , %s, %s >\n", typeid(AViewType).name(), + typeid(BViewType).name(), typeid(PViewType).name()); #endif #endif #endif @@ -46,8 +44,7 @@ namespace KokkosLapack { namespace Impl { template -void lapackGesvWrapper(const AViewType& A, const BViewType& B, - const IPIVViewType& IPIV) { +void lapackGesvWrapper(const AViewType& A, const BViewType& B, const IPIVViewType& IPIV) { using Scalar = typename AViewType::non_const_value_type; const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); @@ -65,88 +62,65 @@ void lapackGesvWrapper(const AViewType& A, const BViewType& B, if constexpr (Kokkos::ArithTraits::is_complex) { using MagType = typename Kokkos::ArithTraits::mag_type; - HostLapack>::gesv( - N, NRHS, reinterpret_cast*>(A.data()), LDA, - IPIV.data(), reinterpret_cast*>(B.data()), LDB, - info); + HostLapack>::gesv(N, NRHS, reinterpret_cast*>(A.data()), LDA, + IPIV.data(), reinterpret_cast*>(B.data()), LDB, + info); } else { - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), - LDB, info); + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), LDB, info); } } } -#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, EXECSPACE, MEM_SPACE) \ - template <> \ - struct GESV< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const EXECSPACE& /* space */, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR \ - "]"); \ - gesv_print_specialization(); \ - lapackGesvWrapper(A, B, IPIV); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, EXECSPACE, MEM_SPACE) \ + template <> \ + struct GESV< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using BViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + \ + static void gesv(const EXECSPACE& /* space */, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR "]"); \ + gesv_print_specialization(); \ + lapackGesvWrapper(A, B, IPIV); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_OPENMP) -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) #endif #if defined(KOKKOS_ENABLE_THREADS) -KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads, - Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) -KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace) #endif } // namespace Impl @@ -161,12 +135,10 @@ namespace KokkosLapack { namespace Impl { template -void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, - const BViewType& B, const IPIVViewType& IPIV) { +void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, const BViewType& B, const IPIVViewType& IPIV) { using scalar_type = typename AViewType::non_const_value_type; - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + - Kokkos::ArithTraits::name() + "]"); + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + Kokkos::ArithTraits::name() + "]"); gesv_print_specialization(); const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); @@ -178,112 +150,88 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, magma_int_t LDB = (BST == 0) ? 1 : BST; magma_int_t NRHS = static_cast(B.extent(1)); - KokkosLapack::Impl::MagmaSingleton& s = - KokkosLapack::Impl::MagmaSingleton::singleton(); - magma_int_t info = 0; + KokkosLapack::Impl::MagmaSingleton& s = KokkosLapack::Impl::MagmaSingleton::singleton(); + magma_int_t info = 0; space.fence(); if constexpr (std::is_same_v) { if (with_pivot) { - magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), - LDB, &info); + magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_sgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), - LDA, reinterpret_cast(B.data()), - LDB, &info); + magma_sgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } if constexpr (std::is_same_v) { if (with_pivot) { - magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), - LDB, &info); + magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_dgesv_nopiv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_dgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } if constexpr (std::is_same_v>) { if (with_pivot) { - magma_cgesv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), LDB, - &info); + magma_cgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_cgesv_nopiv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_cgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } if constexpr (std::is_same_v>) { if (with_pivot) { - magma_zgesv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - IPIV.data(), reinterpret_cast(B.data()), LDB, - &info); + magma_zgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, IPIV.data(), + reinterpret_cast(B.data()), LDB, &info); } else { - magma_zgesv_nopiv_gpu( - N, NRHS, reinterpret_cast(A.data()), LDA, - reinterpret_cast(B.data()), LDB, &info); + magma_zgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); } } ExecSpace().fence(); Kokkos::Profiling::popRegion(); } -#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - magmaGesvWrapper(space, A, B, IPIV); \ - } \ +#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + magmaGesvWrapper(space, A, B, IPIV); \ + } \ }; KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) } // namespace Impl } // namespace KokkosLapack @@ -296,10 +244,9 @@ KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, - const AViewType& A, const BViewType& B) { +template +void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, const AViewType& A, + const BViewType& B) { using memory_space = typename AViewType::memory_space; using Scalar = typename BViewType::non_const_value_type; using ALayout_t = typename AViewType::array_layout; @@ -307,137 +254,109 @@ void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, const int m = A.extent_int(0); const int n = A.extent_int(1); - const int lda = std::is_same_v ? A.stride(0) - : A.stride(1); + const int lda = std::is_same_v ? A.stride(0) : A.stride(1); (void)B; const int nrhs = B.extent_int(1); - const int ldb = std::is_same_v ? B.stride(0) - : B.stride(1); - int lwork = 0; + const int ldb = std::is_same_v ? B.stride(0) : B.stride(1); + int lwork = 0; Kokkos::View info("getrf info"); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf(s.handle, m, n, A.data(), - lda, Workspace.data(), - IPIV.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, - IPIV.data(), B.data(), ldb, info.data())); + cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf(s.handle, m, n, A.data(), - lda, Workspace.data(), - IPIV.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, - IPIV.data(), B.data(), ldb, info.data())); + cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf_bufferSize( - s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), - lda, reinterpret_cast(Workspace.data()), - IPIV.data(), info.data())); - - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs( - s.handle, CUBLAS_OP_N, m, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), ldb, - info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs(s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf_bufferSize( - s.handle, m, n, reinterpret_cast(A.data()), lda, - &lwork)); - Kokkos::View Workspace("getrf workspace", - lwork); - - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf( - s.handle, m, n, reinterpret_cast(A.data()), lda, - reinterpret_cast(Workspace.data()), IPIV.data(), - info.data())); - - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs( - s.handle, CUBLAS_OP_N, m, nrhs, - reinterpret_cast(A.data()), lda, IPIV.data(), - reinterpret_cast(B.data()), ldb, info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnZgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs(s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); } -#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_CUSOLVER," #SCALAR \ - "]"); \ - gesv_print_specialization(); \ - \ - cusolverGesvWrapper(space, IPIV, A, B); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_CUSOLVER," #SCALAR "]"); \ + gesv_print_specialization(); \ + \ + cusolverGesvWrapper(space, IPIV, A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -452,103 +371,78 @@ KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, - const AViewType& A, const BViewType& B) { +template +void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, const AViewType& A, + const BViewType& B) { using Scalar = typename BViewType::non_const_value_type; using ALayout_t = typename AViewType::array_layout; using BLayout_t = typename BViewType::array_layout; const rocblas_int N = static_cast(A.extent(0)); const rocblas_int nrhs = static_cast(B.extent(1)); - const rocblas_int lda = std::is_same_v - ? A.stride(0) - : A.stride(1); - const rocblas_int ldb = std::is_same_v - ? B.stride(0) - : B.stride(1); + const rocblas_int lda = std::is_same_v ? A.stride(0) : A.stride(1); + const rocblas_int ldb = std::is_same_v ? B.stride(0) : B.stride(1); Kokkos::View info("rocsolver info"); - KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_set_stream(s.handle, space.hip_stream())); + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesv(s.handle, N, nrhs, A.data(), - lda, IPIV.data(), B.data(), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_sgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesv(s.handle, N, nrhs, A.data(), - lda, IPIV.data(), B.data(), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_dgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv( - s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), + lda, IPIV.data(), reinterpret_cast(B.data()), + ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesv( - s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), - ldb, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocsolver_zgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::HIP& space, const AViewType& A, \ - const BViewType& B, const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_ROCSOLVER," #SCALAR "]"); \ - gesv_print_specialization(); \ - \ - rocsolverGesvWrapper(space, IPIV, A, B); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::HIP& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_ROCSOLVER," #SCALAR "]"); \ + gesv_print_specialization(); \ + \ + rocsolverGesvWrapper(space, IPIV, A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosLapack diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp index 7a7403209f..cc1ad12b96 100644 --- a/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp @@ -20,148 +20,104 @@ namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct svd_tpl_spec_avail { enum : bool { value = false }; }; // LAPACK -#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_MKL) -#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, EXECSPACE) \ - template <> \ - struct svd_tpl_spec_avail< \ - EXECSPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #if defined(KOKKOS_ENABLE_SERIAL) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #if defined(KOKKOS_ENABLE_OPENMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #if defined(KOKKOS_ENABLE_THREADS) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::Threads) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::Threads) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Threads) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) #endif #endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK || KOKKOSKERNELS_ENABLE_TPL_MKL // CUSOLVER #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER -#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct svd_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif // CUDAUVMSPACE #endif // CUSOLVER // ROCSOLVER #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER -#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct svd_tpl_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) #endif // HIPMANAGEDSPACE #endif // ROCSOLVER diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp index 4385fa40d6..01255bf427 100644 --- a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp @@ -22,8 +22,7 @@ namespace KokkosLapack { namespace Impl { -template +template inline void svd_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER @@ -31,8 +30,7 @@ inline void svd_print_specialization() { printf( "KokkosLapack::svd<> TPL Cusolver specialization for < %s , %s, %s, %s " ">\n", - typeid(AMatrix).name(), typeid(SVector).name(), typeid(UMatrix).name(), - typeid(VMatrix).name()); + typeid(AMatrix).name(), typeid(SVector).name(), typeid(UMatrix).name(), typeid(VMatrix).name()); } #endif #endif @@ -41,18 +39,15 @@ inline void svd_print_specialization() { } // namespace KokkosLapack // LAPACK -#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) #include "KokkosLapack_Host_tpl.hpp" namespace KokkosLapack { namespace Impl { -template -void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename SVector::non_const_value_type; @@ -74,128 +69,96 @@ void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const int ldvt = Vt.stride(1); int lwork = -1, info = 0; - Kokkos::View rwork("svd rwork buffer", - 5 * Kokkos::min(m, n)); + Kokkos::View rwork("svd rwork buffer", 5 * Kokkos::min(m, n)); Kokkos::View work("svd work buffer", 1); if constexpr (Kokkos::ArithTraits::is_complex) { HostLapack>::gesvd( - jobu[0], jobvt[0], m, n, - reinterpret_cast*>(A.data()), lda, S.data(), + jobu[0], jobvt[0], m, n, reinterpret_cast*>(A.data()), lda, S.data(), reinterpret_cast*>(U.data()), ldu, reinterpret_cast*>(Vt.data()), ldvt, - reinterpret_cast*>(work.data()), lwork, - rwork.data(), info); + reinterpret_cast*>(work.data()), lwork, rwork.data(), info); lwork = static_cast(work(0).real()); work = Kokkos::View("svd work buffer", lwork); HostLapack>::gesvd( - jobu[0], jobvt[0], m, n, - reinterpret_cast*>(A.data()), lda, S.data(), + jobu[0], jobvt[0], m, n, reinterpret_cast*>(A.data()), lda, S.data(), reinterpret_cast*>(U.data()), ldu, reinterpret_cast*>(Vt.data()), ldvt, - reinterpret_cast*>(work.data()), lwork, - rwork.data(), info); + reinterpret_cast*>(work.data()), lwork, rwork.data(), info); } else { - HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), - lwork, rwork.data(), info); + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), ldvt, + work.data(), lwork, rwork.data(), info); lwork = static_cast(work(0)); work = Kokkos::View("svd work buffer", lwork); - HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), - lwork, rwork.data(), info); + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), ldvt, + work.data(), lwork, rwork.data(), info); } } -#define KOKKOSLAPACK_SVD_LAPACK(SCALAR, LAYOUT, EXEC_SPACE) \ - template <> \ - struct SVD< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const EXEC_SPACE& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - lapackSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SVD_LAPACK(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + lapackSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #if defined(KOKKOS_ENABLE_OPENMP) KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #if defined(KOKKOS_ENABLE_THREADS) KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads) KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) -KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) #endif } // namespace Impl @@ -208,11 +171,9 @@ KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename SVector::non_const_value_type; @@ -233,33 +194,25 @@ void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], const lapack_int ldu = U.stride(1); const lapack_int ldvt = Vt.stride(1); - Kokkos::View rwork("svd rwork buffer", - Kokkos::min(m, n) - 1); + Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); lapack_int ret = 0; if constexpr (std::is_same_v) { - ret = - LAPACKE_sgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, - S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + ret = LAPACKE_sgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), + ldvt, rwork.data()); } if constexpr (std::is_same_v) { - ret = - LAPACKE_dgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, - S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + ret = LAPACKE_dgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), ldu, Vt.data(), + ldvt, rwork.data()); } if constexpr (std::is_same_v>) { - ret = LAPACKE_cgesvd( - LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, rwork.data()); + ret = LAPACKE_cgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), + lda, S.data(), reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data()); } if constexpr (std::is_same_v>) { - ret = LAPACKE_zgesvd( - LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, - rwork.data()); + ret = LAPACKE_zgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), + lda, S.data(), reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data()); } if (ret != 0) { @@ -269,90 +222,67 @@ void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], } } -#define KOKKOSLAPACK_SVD_MKL(SCALAR, LAYOUT, EXEC_SPACE) \ - template <> \ - struct SVD< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const EXEC_SPACE& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - mklSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SVD_MKL(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + mklSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #if defined(KOKKOS_ENABLE_SERIAL) KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Serial) KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) #endif #if defined(KOKKOS_ENABLE_OPENMP) KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::OpenMP) KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) #endif #if defined(KOKKOS_ENABLE_THREADS) KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Threads) KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Threads) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) -KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Threads) #endif } // namespace Impl @@ -366,11 +296,9 @@ KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename SVector::non_const_value_type; @@ -393,128 +321,98 @@ void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], int lwork = 0; Kokkos::View info("svd info"); - Kokkos::View rwork("svd rwork buffer", - Kokkos::min(m, n) - 1); + Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd( - s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), - ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), + info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd( - s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), - ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), + info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, - reinterpret_cast(work.data()), lwork, - rwork.data(), info.data())); + cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, + reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( - cusolverDnZgesvd(s.handle, jobu[0], jobvt[0], m, n, - reinterpret_cast(A.data()), lda, - S.data(), reinterpret_cast(U.data()), - ldu, reinterpret_cast(Vt.data()), - ldvt, reinterpret_cast(work.data()), - lwork, rwork.data(), info.data())); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd( + s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, + reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); } -#define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct SVD< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const Kokkos::Cuda& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_CUSOLVER," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - cusolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::Cuda& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_CUSOLVER," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + cusolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif } // namespace Impl @@ -529,11 +427,9 @@ KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, namespace KokkosLapack { namespace Impl { -template -void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], - const char jobvt[], const AMatrix& A, const SVector& S, - const UMatrix& U, const VMatrix& Vt) { +template +void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { using memory_space = typename AMatrix::memory_space; using Scalar = typename AMatrix::non_const_value_type; using Magnitude = typename SVector::non_const_value_type; @@ -574,111 +470,84 @@ void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const rocblas_workmode WorkMode = rocblas_outofplace; Kokkos::View info("svd info"); - Kokkos::View rwork("svd rwork buffer", - Kokkos::min(m, n) - 1); + Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); - KokkosBlas::Impl::RocBlasSingleton& s = - KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( - rocblas_set_stream(s.handle, space.hip_stream())); + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd( - s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), - ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd( - s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), - ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v>) { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesvd( - s.handle, UVecMode, VVecMode, m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, rwork.data(), - WorkMode, info.data())); + s.handle, UVecMode, VVecMode, m, n, reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), + ldvt, rwork.data(), WorkMode, info.data())); } if constexpr (std::is_same_v>) { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesvd( - s.handle, UVecMode, VVecMode, m, n, - reinterpret_cast(A.data()), lda, S.data(), - reinterpret_cast(U.data()), ldu, - reinterpret_cast(Vt.data()), ldvt, - rwork.data(), WorkMode, info.data())); + s.handle, UVecMode, VVecMode, m, n, reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), + ldvt, rwork.data(), WorkMode, info.data())); } KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); } -#define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct SVD< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - svd_eti_spec_avail< \ - Kokkos::HIP, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using SVector = \ - Kokkos::View::mag_type*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using UMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using VMatrix = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void svd(const Kokkos::HIP& space, const char jobu[], \ - const char jobvt[], const AMatrix& A, const SVector& S, \ - const UMatrix& U, const VMatrix& Vt) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_ROCSOLVER," #SCALAR \ - "]"); \ - svd_print_specialization(); \ - \ - rocsolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD< \ + Kokkos::HIP, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::HIP& space, const char jobu[], const char jobvt[], const AMatrix& A, \ + const SVector& S, const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_ROCSOLVER," #SCALAR "]"); \ + svd_print_specialization(); \ + \ + rocsolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) -KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) #if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) -KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) #endif } // namespace Impl diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp index 7251d97086..6ec8d26a98 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp @@ -27,14 +27,13 @@ struct trtri_tpl_spec_avail { }; // Generic Host side LAPACK (could be MKL or whatever) -#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ - template \ - struct trtri_tpl_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ + template \ + struct trtri_tpl_spec_avail< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK @@ -51,80 +50,52 @@ struct trtri_tpl_spec_avail { #define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, - Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #ifdef KOKKOS_ENABLE_CUDA -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif } // namespace Impl diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index b7e9c6e341..b326e722a0 100644 --- a/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -27,112 +27,86 @@ namespace KokkosLapack { namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRTRI >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ - RViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void trtri(const RViewType& R, const char uplo[], \ - const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ - const int M = static_cast(A.extent(0)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - \ - const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - \ - char uplo_; \ - \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = A_is_layout_left ? 'L' : 'U'; \ - else \ - uplo_ = A_is_layout_left ? 'U' : 'L'; \ - \ - R() = HostLapack::trtri( \ - uplo_, diag[0], M, \ - reinterpret_cast(A.data()), LDA); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRTRI >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View > \ + RViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void trtri(const RViewType& R, const char uplo[], const char diag[], const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ + const int M = static_cast(A.extent(0)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + \ + const int AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + \ + char uplo_; \ + \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = A_is_layout_left ? 'L' : 'U'; \ + else \ + uplo_ = A_is_layout_left ? 'U' : 'L'; \ + \ + R() = HostLapack::trtri(uplo_, diag[0], M, \ + reinterpret_cast(A.data()), LDA); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #else -#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ - MAGMA_FN, LAYOUTA, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct TRTRI >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ - RViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void trtri(const RViewType& R, const char uplo[], \ - const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ - magma_int_t M = static_cast(A.extent(0)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - \ - magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - magma_int_t info = 0; \ - magma_uplo_t uplo_; \ - magma_diag_t diag_; \ - \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ - else \ - uplo_ = A_is_layout_left ? MagmaUpper : MagmaLower; \ - \ - if (diag[0] == 'U' || diag[0] == 'u') \ - diag_ = MagmaUnit; \ - else \ - diag_ = MagmaNonUnit; \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - R() = MAGMA_FN(uplo_, diag_, M, \ - reinterpret_cast( \ - const_cast(A.data())), \ - LDA, &info); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRTRI >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View > \ + RViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void trtri(const RViewType& R, const char uplo[], const char diag[], const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ + magma_int_t M = static_cast(A.extent(0)); \ + \ + bool A_is_layout_left = std::is_same::value; \ + \ + magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + magma_int_t info = 0; \ + magma_uplo_t uplo_; \ + magma_diag_t diag_; \ + \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ + else \ + uplo_ = A_is_layout_left ? MagmaUpper : MagmaLower; \ + \ + if (diag[0] == 'U' || diag[0] == 'u') \ + diag_ = MagmaUnit; \ + else \ + diag_ = MagmaNonUnit; \ + \ + KokkosLapack::Impl::MagmaSingleton& s = KokkosLapack::Impl::MagmaSingleton::singleton(); \ + R() = MAGMA_FN(uplo_, diag_, M, reinterpret_cast(const_cast(A.data())), LDA, \ + &info); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #else -#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ - MAGMA_FN, LAYOUTA, MEM_SPACE, \ - ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA // Explicitly define the TRTRI class for all permutations listed below @@ -140,63 +114,50 @@ namespace Impl { // Handle type and space permutations #ifdef KOKKOS_ENABLE_CUDA -#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, \ +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, \ +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ - std::complex, LAYOUTA, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ - magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ - Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) - -#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ - magmaFloatComplex_ptr, magma_ctrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ - Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, LAYOUTA, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, LAYOUTA, \ + Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, LAYOUTA, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, LAYOUTA, \ + Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) #else -#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ - std::complex, LAYOUTA, \ - Kokkos::HostSpace, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) -#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) #endif diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 77774d1d3f..653ed2cbf2 100644 --- a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -16,14 +16,11 @@ // only enable this test where KokkosLapack supports gesv: // CUDA+(MAGMA or CUSOLVER), HIP+ROCSOLVER and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ - (defined(TEST_HIP_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ - defined(TEST_THREADS_LAPACK_CPP))) +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ + (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include #include @@ -66,12 +63,8 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror(B); // Initialize data. - Kokkos::fill_random( - A, rand_pool, - Kokkos::rand, ScalarA>::max()); - Kokkos::fill_random( - X0, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(X0, rand_pool, Kokkos::rand, ScalarA>::max()); // Generate RHS B = A*X0. ScalarA alpha = 1.0; @@ -84,9 +77,8 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - using ViewTypeP = typename std::conditional< - MAGMA, Kokkos::View, - Kokkos::View>::type; + using ViewTypeP = typename std::conditional, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -105,8 +97,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL - nopivot_runtime_err = (!std::is_same::value) && + nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -138,8 +129,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { printf( " Error %d, pivot %c, padding %c: result( %.15lf ) !=" "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", - N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), - ats::abs(h_B(i) - h_X0(i)), eps); + N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), ats::abs(h_B(i) - h_X0(i)), eps); break; } } @@ -147,8 +137,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { } template -void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, - int nrhs) { +void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) { using execution_space = typename Device::execution_space; using ScalarA = typename ViewTypeA::value_type; using ats = Kokkos::ArithTraits; @@ -177,12 +166,8 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror(B); // Initialize data. - Kokkos::fill_random( - A, rand_pool, - Kokkos::rand, ScalarA>::max()); - Kokkos::fill_random( - X0, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(X0, rand_pool, Kokkos::rand, ScalarA>::max()); // Generate RHS B = A*X0. ScalarA alpha = 1.0; @@ -195,9 +180,8 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - using ViewTypeP = typename std::conditional< - MAGMA, Kokkos::View, - Kokkos::View>::type; + using ViewTypeP = typename std::conditional, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -216,8 +200,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL - nopivot_runtime_err = (!std::is_same::value) && + nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -263,49 +246,32 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, template int test_gesv(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ - (defined(TEST_HIP_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ - defined(TEST_THREADS_LAPACK_CPP))) - Test::impl_test_gesv( - &mode[0], "N", 2); // no padding - Test::impl_test_gesv( - &mode[0], "N", 13); // no padding - Test::impl_test_gesv( - &mode[0], "N", 179); // no padding - Test::impl_test_gesv( - &mode[0], "N", 64); // no padding - Test::impl_test_gesv( - &mode[0], "N", 1024); // no padding +#if (defined(TEST_CUDA_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) + Test::impl_test_gesv(&mode[0], "N", 2); // no padding + Test::impl_test_gesv(&mode[0], "N", 13); // no padding + Test::impl_test_gesv(&mode[0], "N", 179); // no padding + Test::impl_test_gesv(&mode[0], "N", 64); // no padding + Test::impl_test_gesv(&mode[0], "N", 1024); // no padding #elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) - if constexpr (std::is_same_v) { - Test::impl_test_gesv( - &mode[0], "N", 2); // no padding - Test::impl_test_gesv( - &mode[0], "N", 13); // no padding - Test::impl_test_gesv( - &mode[0], "N", 179); // no padding - Test::impl_test_gesv( - &mode[0], "N", 64); // no padding - Test::impl_test_gesv( - &mode[0], "N", 1024); // no padding - - Test::impl_test_gesv( - &mode[0], "Y", - 13); // padding - Test::impl_test_gesv( - &mode[0], "Y", - 179); // padding + if constexpr (std::is_same_v) { + Test::impl_test_gesv(&mode[0], "N", 2); // no padding + Test::impl_test_gesv(&mode[0], "N", 13); // no padding + Test::impl_test_gesv(&mode[0], "N", 179); // no padding + Test::impl_test_gesv(&mode[0], "N", 64); // no padding + Test::impl_test_gesv(&mode[0], "N", 1024); // no padding + + Test::impl_test_gesv(&mode[0], "Y", + 13); // padding + Test::impl_test_gesv(&mode[0], "Y", + 179); // padding } #endif #endif @@ -318,48 +284,31 @@ int test_gesv(const char* mode) { template int test_gesv_mrhs(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) using view_type_a_ll = Kokkos::View; using view_type_b_ll = Kokkos::View; -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ - (defined(TEST_HIP_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ - defined(TEST_THREADS_LAPACK_CPP))) - Test::impl_test_gesv_mrhs( - &mode[0], "N", 2, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 13, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 179, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 64, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 1024, 5); // no padding +#if (defined(TEST_CUDA_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) + Test::impl_test_gesv_mrhs(&mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding // When appropriate run MAGMA specific tests #elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) - if constexpr (std::is_same_v) { - Test::impl_test_gesv_mrhs( - &mode[0], "N", 2, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 13, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 179, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 64, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "N", 1024, 5); // no padding - - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 13, 5); // padding - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 179, 5); // padding + if constexpr (std::is_same_v) { + Test::impl_test_gesv_mrhs(&mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding + + Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5); // padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 179, 5); // padding } #endif #endif @@ -370,8 +319,7 @@ int test_gesv_mrhs(const char* mode) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_float"); test_gesv("N"); // No pivoting @@ -388,8 +336,7 @@ TEST_F(TestCategory, gesv_mrhs_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_double"); test_gesv("N"); // No pivoting @@ -406,8 +353,7 @@ TEST_F(TestCategory, gesv_mrhs_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_double"); test_gesv, TestDevice>("N"); // No pivoting @@ -424,8 +370,7 @@ TEST_F(TestCategory, gesv_mrhs_complex_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_float"); test_gesv, TestDevice>("N"); // No pivoting diff --git a/lapack/unit_test/Test_Lapack_svd.hpp b/lapack/unit_test/Test_Lapack_svd.hpp index da9f9ba480..a47dbbe9b9 100644 --- a/lapack/unit_test/Test_Lapack_svd.hpp +++ b/lapack/unit_test/Test_Lapack_svd.hpp @@ -26,10 +26,8 @@ namespace Test { template -void check_triple_product( - const AMatrix& A, const SVector& S, const UMatrix& U, const VMatrix& Vt, - typename Kokkos::ArithTraits< - typename AMatrix::non_const_value_type>::mag_type tol) { +void check_triple_product(const AMatrix& A, const SVector& S, const UMatrix& U, const VMatrix& Vt, + typename Kokkos::ArithTraits::mag_type tol) { // After a successful SVD decomposition we have A=U*S*V // So using gemm we should be able to compare the above // triple product to the original matrix A. @@ -40,8 +38,7 @@ void check_triple_product( // First compute the left side of the product: temp = U*S Kokkos::parallel_for( - Kokkos::RangePolicy(0, U.extent_int(0)), - KOKKOS_LAMBDA(const int& rowIdx) { + Kokkos::RangePolicy(0, U.extent_int(0)), KOKKOS_LAMBDA(const int& rowIdx) { for (int colIdx = 0; colIdx < U.extent_int(1); ++colIdx) { if (colIdx < S.extent_int(0)) { temp(rowIdx, colIdx) = U(rowIdx, colIdx) * S(colIdx); @@ -69,8 +66,7 @@ void check_triple_product( template void check_unitary_orthogonal_matrix( - const Matrix& M, typename Kokkos::ArithTraits< - typename Matrix::non_const_value_type>::mag_type tol) { + const Matrix& M, typename Kokkos::ArithTraits::mag_type tol) { // After a successful SVD decomposition the matrices // U and V are unitary matrices. Thus we can check // the property UUt=UtU=I and VVt=VtV=I using gemm. @@ -83,11 +79,9 @@ void check_unitary_orthogonal_matrix( for (int rowIdx = 0; rowIdx < M.extent_int(0); ++rowIdx) { for (int colIdx = 0; colIdx < M.extent_int(0); ++colIdx) { if (rowIdx == colIdx) { - EXPECT_NEAR_KK_REL(I0_h(rowIdx, colIdx), - Kokkos::ArithTraits::one(), tol); + EXPECT_NEAR_KK_REL(I0_h(rowIdx, colIdx), Kokkos::ArithTraits::one(), tol); } else { - EXPECT_NEAR_KK(I0_h(rowIdx, colIdx), - Kokkos::ArithTraits::zero(), tol); + EXPECT_NEAR_KK(I0_h(rowIdx, colIdx), Kokkos::ArithTraits::zero(), tol); } } } @@ -99,11 +93,9 @@ void check_unitary_orthogonal_matrix( for (int rowIdx = 0; rowIdx < M.extent_int(1); ++rowIdx) { for (int colIdx = 0; colIdx < M.extent_int(1); ++colIdx) { if (rowIdx == colIdx) { - EXPECT_NEAR_KK_REL(I1_h(rowIdx, colIdx), - Kokkos::ArithTraits::one(), tol); + EXPECT_NEAR_KK_REL(I1_h(rowIdx, colIdx), Kokkos::ArithTraits::one(), tol); } else { - EXPECT_NEAR_KK(I1_h(rowIdx, colIdx), - Kokkos::ArithTraits::zero(), tol); + EXPECT_NEAR_KK(I1_h(rowIdx, colIdx), Kokkos::ArithTraits::zero(), tol); } } } @@ -113,9 +105,8 @@ template int impl_analytic_2x2_svd() { using scalar_type = typename AMatrix::value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; - using vector_type = - Kokkos::View; - using KAT_S = Kokkos::ArithTraits; + using vector_type = Kokkos::View; + using KAT_S = Kokkos::ArithTraits; const mag_type eps = KAT_S::eps(); @@ -147,8 +138,7 @@ int impl_analytic_2x2_svd() { // The singular values for this problem // are known: sqrt(45) and sqrt(5) - EXPECT_NEAR_KK_REL(S_h(0), static_cast(Kokkos::sqrt(45)), - 100 * eps); + EXPECT_NEAR_KK_REL(S_h(0), static_cast(Kokkos::sqrt(45)), 100 * eps); EXPECT_NEAR_KK_REL(S_h(1), static_cast(Kokkos::sqrt(5)), 100 * eps); // The singular vectors should be identical @@ -156,21 +146,16 @@ int impl_analytic_2x2_svd() { // component of the vectors to determine // the proper signed comparison. std::vector Uref = { - static_cast(1 / Kokkos::sqrt(10)), - static_cast(3 / Kokkos::sqrt(10)), - static_cast(-3 / Kokkos::sqrt(10)), - static_cast(1 / Kokkos::sqrt(10))}; + static_cast(1 / Kokkos::sqrt(10)), static_cast(3 / Kokkos::sqrt(10)), + static_cast(-3 / Kokkos::sqrt(10)), static_cast(1 / Kokkos::sqrt(10))}; std::vector Vtref = { - static_cast(1 / Kokkos::sqrt(2)), - static_cast(-1 / Kokkos::sqrt(2)), - static_cast(1 / Kokkos::sqrt(2)), - static_cast(1 / Kokkos::sqrt(2))}; + static_cast(1 / Kokkos::sqrt(2)), static_cast(-1 / Kokkos::sqrt(2)), + static_cast(1 / Kokkos::sqrt(2)), static_cast(1 / Kokkos::sqrt(2))}; // Both rotations and reflections are valid // vector basis so we need to check both signs // to confirm proper SVD was achieved. - Kokkos::View U_real("U real", 2, 2), - Vt_real("Vt real", 2, 2); + Kokkos::View U_real("U real", 2, 2), Vt_real("Vt real", 2, 2); if constexpr (KAT_S::is_complex) { U_real(0, 0) = U_h(0, 0).real(); U_real(0, 1) = U_h(0, 1).real(); @@ -219,9 +204,8 @@ template int impl_analytic_2x3_svd() { using scalar_type = typename AMatrix::value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; - using vector_type = - Kokkos::View; - using KAT_S = Kokkos::ArithTraits; + using vector_type = Kokkos::View; + using KAT_S = Kokkos::ArithTraits; const mag_type tol = 100 * KAT_S::eps(); @@ -277,8 +261,7 @@ int impl_analytic_2x3_svd() { // Both rotations and reflections are valid // vector basis so we need to check both signs // to confirm proper SVD was achieved. - Kokkos::View U_real("U real", 2, 2), - Vt_real("Vt real", 3, 3); + Kokkos::View U_real("U real", 2, 2), Vt_real("Vt real", 3, 3); if constexpr (KAT_S::is_complex) { U_real(0, 0) = U_h(0, 0).real(); U_real(0, 1) = U_h(0, 1).real(); @@ -350,9 +333,8 @@ template int impl_analytic_3x2_svd() { using scalar_type = typename AMatrix::value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; - using vector_type = - Kokkos::View; - using KAT_S = Kokkos::ArithTraits; + using vector_type = Kokkos::View; + using KAT_S = Kokkos::ArithTraits; const mag_type tol = 100 * KAT_S::eps(); @@ -396,8 +378,7 @@ int impl_analytic_3x2_svd() { // Both rotations and reflections are valid // vector basis so we need to check both signs // to confirm proper SVD was achieved. - Kokkos::View U_real("U real", 3, 3), - Vt_real("Vt real", 2, 2); + Kokkos::View U_real("U real", 3, 3), Vt_real("Vt real", 2, 2); if constexpr (KAT_S::is_complex) { U_real(0, 0) = U_h(0, 0).real(); U_real(0, 1) = U_h(0, 1).real(); @@ -471,8 +452,7 @@ int impl_test_svd(const int m, const int n) { using scalar_type = typename AMatrix::value_type; using KAT_S = Kokkos::ArithTraits; using mag_type = typename KAT_S::mag_type; - using vector_type = - Kokkos::View; + using vector_type = Kokkos::View; const mag_type max_val = 10; const mag_type tol = 2000 * max_val * KAT_S::eps(); @@ -480,8 +460,7 @@ int impl_test_svd(const int m, const int n) { AMatrix A("A", m, n), U("U", m, m), Vt("Vt", n, n), Aref("A ref", m, n); vector_type S("S", Kokkos::min(m, n)); - const uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); // Initialize A with random numbers @@ -492,8 +471,7 @@ int impl_test_svd(const int m, const int n) { // Working around CUSOLVER constraint for m >= n #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { if (m >= n) { KokkosLapack::svd("A", "A", A, S, U, Vt); } else { @@ -523,10 +501,8 @@ int test_svd() { int ret; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_left = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_left = Kokkos::View; ret = Test::impl_analytic_2x2_svd(); EXPECT_EQ(ret, 0); @@ -554,10 +530,8 @@ int test_svd() { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_right = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_right = Kokkos::View; ret = Test::impl_analytic_2x2_svd(); EXPECT_EQ(ret, 0); @@ -589,18 +563,15 @@ int test_svd() { template int test_svd_wrapper() { -#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_MKL) - if constexpr (std::is_same_v) { +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + if constexpr (std::is_same_v) { // Using a device side space with LAPACK/MKL return test_svd(); } #endif #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { // Using a Cuda device with CUSOLVER return test_svd(); } @@ -618,8 +589,7 @@ int test_svd_wrapper() { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_float"); test_svd_wrapper(); @@ -628,8 +598,7 @@ TEST_F(TestCategory, svd_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_double"); test_svd_wrapper(); @@ -638,8 +607,7 @@ TEST_F(TestCategory, svd_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_float"); test_svd_wrapper, TestDevice>(); @@ -648,8 +616,7 @@ TEST_F(TestCategory, svd_complex_float) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, svd_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_double"); test_svd_wrapper, TestDevice>(); diff --git a/lapack/unit_test/Test_Lapack_trtri.hpp b/lapack/unit_test/Test_Lapack_trtri.hpp index a19e575d89..b555ea8aaf 100644 --- a/lapack/unit_test/Test_Lapack_trtri.hpp +++ b/lapack/unit_test/Test_Lapack_trtri.hpp @@ -43,8 +43,7 @@ struct NonUnitDiagTRTRI { KOKKOS_INLINE_FUNCTION void operator()(const int& i) const { A_(i, i) = A_(i, i) + 10; } }; -template +template struct VanillaGEMM { bool A_t, B_t, A_c, B_c; int N, K; @@ -61,12 +60,9 @@ struct VanillaGEMM { ScalarC beta; KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type& team) - const { + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && \ - !defined(__HIP_DEVICE_COMPILE__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); @@ -97,8 +93,7 @@ struct VanillaGEMM { }; template -int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, - const int M, const int N) { +int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, const int M, const int N) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; using APT = Kokkos::ArithTraits; @@ -111,9 +106,8 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, ViewTypeA A("A", M, N); ViewTypeA A_original("A_original", M, N); ViewTypeA A_I("A_I", M, N); // is I taken...? - uint64_t seed = - std::chrono::high_resolution_clock::now().time_since_epoch().count(); - ScalarA beta = ScalarA(0); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); ScalarA cur_check_val; // Either 1 or 0, to check A_I // const int As0 = A.stride(0), As1 = A.stride(1); @@ -137,8 +131,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, } } // Set just 1 value in the diagonal to 0. - if (M > 0 && N > 0) - host_A(bad_diag_idx - 1, bad_diag_idx - 1) = ScalarA(0); + if (M > 0 && N > 0) host_A(bad_diag_idx - 1, bad_diag_idx - 1) = ScalarA(0); Kokkos::deep_copy(A, host_A); } return KokkosLapack::trtri(uplo, diag, A); @@ -151,21 +144,17 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, Kokkos::Random_XorShift64_Pool rand_pool(seed); // Initialize A with deterministic random numbers - Kokkos::fill_random( - A, rand_pool, - Kokkos::rand, ScalarA>::max()); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, ScalarA>::max()); if ((diag[0] == 'U') || (diag[0] == 'u')) { using functor_type = UnitDiagTRTRI; functor_type udtrtri(A); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosLapack::Test::UnitDiagTRTRI", - Kokkos::RangePolicy(0, M), udtrtri); + Kokkos::parallel_for("KokkosLapack::Test::UnitDiagTRTRI", Kokkos::RangePolicy(0, M), udtrtri); } else { //(diag[0]=='N')||(diag[0]=='n') using functor_type = NonUnitDiagTRTRI; functor_type nudtrtri(A); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosLapack::Test::NonUnitDiagTRTRI", - Kokkos::RangePolicy(0, M), nudtrtri); + Kokkos::parallel_for("KokkosLapack::Test::NonUnitDiagTRTRI", Kokkos::RangePolicy(0, M), nudtrtri); } Kokkos::fence(); Kokkos::deep_copy(host_A, A); @@ -199,8 +188,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, Kokkos::fence(); if (ret) { - printf("KokkosLapack::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], - typeid(ViewTypeA).name(), ret); + printf("KokkosLapack::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], typeid(ViewTypeA).name(), ret); return ret; } @@ -228,12 +216,10 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, vgemm.C = A_I; // out vgemm.alpha = ScalarA(1); vgemm.beta = beta; - Kokkos::parallel_for( - "KokkosLapack::Test::VanillaGEMM", - Kokkos::TeamPolicy( - M, Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - vgemm); + Kokkos::parallel_for("KokkosLapack::Test::VanillaGEMM", + Kokkos::TeamPolicy( + M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), + vgemm); Kokkos::fence(); Kokkos::deep_copy(host_I, A_I); @@ -251,8 +237,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { // Set check value - cur_check_val = - (i == j) ? ScalarA(1) : ScalarA(0); // APT::abs(host_A(i,j)); + cur_check_val = (i == j) ? ScalarA(1) : ScalarA(0); // APT::abs(host_A(i,j)); // Check how close |A_I - cur_check_val| is to 0. if (APT::abs(APT::abs(host_I(i, j)) - cur_check_val) > eps) { @@ -276,38 +261,30 @@ int test_trtri(const char* mode) { int ret; int bad_diag_idx = -1; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_left = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_left = Kokkos::View; - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 0, 0); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 0, 0); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 1, 1); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 1, 1); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 15, 15); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 15, 15); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 100, 100); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 100, 100); EXPECT_EQ(ret, 0); // Rounding errors with randomly generated matrices begin here where M>100, so // we pass in A=I - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 273, 273); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 273, 273); EXPECT_EQ(ret, 0); // Only non-unit matrices could be singular. if (mode[1] == 'N' || mode[1] == 'n') { bad_diag_idx = 2; // 1-index based - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 2, 2); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 2, 2); EXPECT_EQ(ret, bad_diag_idx); bad_diag_idx = -1; } @@ -318,38 +295,30 @@ int test_trtri(const char* mode) { #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - using view_type_a_layout_right = - Kokkos::View; + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_right = Kokkos::View; - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 0, 0); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 0, 0); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 1, 1); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 1, 1); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 15, 15); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 15, 15); EXPECT_EQ(ret, 0); - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 100, 100); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 100, 100); EXPECT_EQ(ret, 0); // Rounding errors with randomly generated matrices begin here where M>100, so // we pass in A=I - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 273, 273); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 273, 273); EXPECT_EQ(ret, 0); // Only non-unit matrices could be singular. if (mode[1] == 'N' || mode[1] == 'n') { bad_diag_idx = 2; // 1-index based - ret = Test::impl_test_trtri( - bad_diag_idx, &mode[0], &mode[1], 2, 2); + ret = Test::impl_test_trtri(bad_diag_idx, &mode[0], &mode[1], 2, 2); EXPECT_EQ(ret, bad_diag_idx); bad_diag_idx = -1; } @@ -359,8 +328,7 @@ int test_trtri(const char* mode) { } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_float"); test_trtri("UN"); @@ -372,8 +340,7 @@ TEST_F(TestCategory, trtri_float) { #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_double"); test_trtri("UN"); @@ -385,8 +352,7 @@ TEST_F(TestCategory, trtri_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_double) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_double"); test_trtri, TestDevice>("UN"); @@ -398,8 +364,7 @@ TEST_F(TestCategory, trtri_complex_double) { #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_float) { Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_float"); test_trtri, TestDevice>("UN"); diff --git a/ode/impl/KokkosODE_BDF_impl.hpp b/ode/impl/KokkosODE_BDF_impl.hpp index cf89731f1b..3119ff0e3a 100644 --- a/ode/impl/KokkosODE_BDF_impl.hpp +++ b/ode/impl/KokkosODE_BDF_impl.hpp @@ -44,31 +44,27 @@ struct BDF_table<2> { template <> struct BDF_table<3> { static constexpr int order = 3; - Kokkos::Array coefficients{ - {-18.0 / 11.0, 9.0 / 11.0, -2.0 / 11.0, 6.0 / 11.0}}; + Kokkos::Array coefficients{{-18.0 / 11.0, 9.0 / 11.0, -2.0 / 11.0, 6.0 / 11.0}}; }; template <> struct BDF_table<4> { static constexpr int order = 4; - Kokkos::Array coefficients{ - {-48.0 / 25.0, 36.0 / 25.0, -16.0 / 25.0, 3.0 / 25.0, 12.0 / 25.0}}; + Kokkos::Array coefficients{{-48.0 / 25.0, 36.0 / 25.0, -16.0 / 25.0, 3.0 / 25.0, 12.0 / 25.0}}; }; template <> struct BDF_table<5> { static constexpr int order = 5; - Kokkos::Array coefficients{{-300.0 / 137.0, 300.0 / 137.0, - -200.0 / 137.0, 75.0 / 137.0, - -12.0 / 137.0, 60.0 / 137.0}}; + Kokkos::Array coefficients{ + {-300.0 / 137.0, 300.0 / 137.0, -200.0 / 137.0, 75.0 / 137.0, -12.0 / 137.0, 60.0 / 137.0}}; }; template <> struct BDF_table<6> { static constexpr int order = 6; Kokkos::Array coefficients{ - {-360.0 / 147.0, 450.0 / 147.0, -400.0 / 147.0, 225.0 / 147.0, - -72.0 / 147.0, 10.0 / 147.0, 60.0 / 147.0}}; + {-360.0 / 147.0, 450.0 / 147.0, -400.0 / 147.0, 225.0 / 147.0, -72.0 / 147.0, 10.0 / 147.0, 60.0 / 147.0}}; }; template @@ -82,14 +78,9 @@ struct BDF_system_wrapper { mv_type yn; KOKKOS_FUNCTION - BDF_system_wrapper(const system_type& mySys_, const table_type& table_, - const double t_, const double dt_, const mv_type& yn_) - : mySys(mySys_), - neqs(mySys_.neqs), - table(table_), - t(t_), - dt(dt_), - yn(yn_) {} + BDF_system_wrapper(const system_type& mySys_, const table_type& table_, const double t_, const double dt_, + const mv_type& yn_) + : mySys(mySys_), neqs(mySys_.neqs), table(table_), t(t_), dt(dt_), yn(yn_) {} template KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { @@ -99,8 +90,7 @@ struct BDF_system_wrapper { for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { f(eqIdx) = y(eqIdx) - table.coefficients[order] * dt * f(eqIdx); for (int orderIdx = 0; orderIdx < order; ++orderIdx) { - f(eqIdx) += - table.coefficients[order - 1 - orderIdx] * yn(eqIdx, orderIdx); + f(eqIdx) += table.coefficients[order - 1 - orderIdx] * yn(eqIdx, orderIdx); } } } @@ -111,8 +101,7 @@ struct BDF_system_wrapper { for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { for (int colIdx = 0; colIdx < neqs; ++colIdx) { - jac(rowIdx, colIdx) = - -table.coefficients[order] * dt * jac(rowIdx, colIdx); + jac(rowIdx, colIdx) = -table.coefficients[order] * dt * jac(rowIdx, colIdx); } jac(rowIdx, rowIdx) += 1.0; } @@ -130,13 +119,12 @@ struct BDF_system_wrapper2 { double t, dt, c = 0; KOKKOS_FUNCTION - BDF_system_wrapper2(const system_type& mySys_, const subview_type& psi_, - const d_vec_type& d_, const double t_, const double dt_) + BDF_system_wrapper2(const system_type& mySys_, const subview_type& psi_, const d_vec_type& d_, const double t_, + const double dt_) : mySys(mySys_), neqs(mySys_.neqs), psi(psi_), d(d_), t(t_), dt(dt_) {} template - KOKKOS_FUNCTION void residual(const YVectorType& y, - const FVectorType& f) const { + KOKKOS_FUNCTION void residual(const YVectorType& y, const FVectorType& f) const { // f = f(t+dt, y) mySys.evaluate_function(t, dt, y, f); @@ -165,14 +153,10 @@ struct BDF_system_wrapper2 { } }; -template -KOKKOS_FUNCTION void BDFStep(ode_type& ode, const table_type& table, - scalar_type t, scalar_type dt, - const vec_type& y_old, const vec_type& y_new, - const vec_type& rhs, const vec_type& update, - const vec_type& scale, const mv_type& y_vecs, - const mat_type& temp, const mat_type& jac) { +template +KOKKOS_FUNCTION void BDFStep(ode_type& ode, const table_type& table, scalar_type t, scalar_type dt, + const vec_type& y_old, const vec_type& y_new, const vec_type& rhs, const vec_type& update, + const vec_type& scale, const mv_type& y_vecs, const mat_type& temp, const mat_type& jac) { using newton_params = KokkosODE::Experimental::Newton_params; BDF_system_wrapper sys(ode, table, t, dt, y_vecs); @@ -184,57 +168,43 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, const table_type& table, } // solver the nonlinear problem - { - KokkosODE::Experimental::Newton::Solve(sys, param, jac, temp, y_new, rhs, - update, scale); - } + { KokkosODE::Experimental::Newton::Solve(sys, param, jac, temp, y_new, rhs, update, scale); } } // BDFStep template -KOKKOS_FUNCTION void compute_coeffs(const int order, const scalar_type factor, - const mat_type& coeffs) { +KOKKOS_FUNCTION void compute_coeffs(const int order, const scalar_type factor, const mat_type& coeffs) { coeffs(0, 0) = 1.0; for (int colIdx = 0; colIdx < order; ++colIdx) { coeffs(0, colIdx + 1) = 1.0; for (int rowIdx = 0; rowIdx < order; ++rowIdx) { coeffs(rowIdx + 1, colIdx + 1) = - ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * - coeffs(rowIdx, colIdx + 1); + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * coeffs(rowIdx, colIdx + 1); } } } template -KOKKOS_FUNCTION void update_D(const int order, const scalar_type factor, - const mat_type& coeffs, const mat_type& tempD, +KOKKOS_FUNCTION void update_D(const int order, const scalar_type factor, const mat_type& coeffs, const mat_type& tempD, const mat_type& D) { - auto subD = - Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(0, order + 1)); - auto subTempD = Kokkos::subview(tempD, Kokkos::ALL(), - Kokkos::pair(0, order + 1)); + auto subD = Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(0, order + 1)); + auto subTempD = Kokkos::subview(tempD, Kokkos::ALL(), Kokkos::pair(0, order + 1)); compute_coeffs(order, factor, coeffs); - auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); - KokkosBatched::SerialGemm< - KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subD, R, 0.0, subTempD); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm::invoke(1.0, subD, R, 0.0, subTempD); compute_coeffs(order, 1.0, coeffs); - auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); - KokkosBatched::SerialGemm< - KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subTempD, U, 0.0, subD); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm::invoke(1.0, subTempD, U, 0.0, subD); } -template -KOKKOS_FUNCTION void initial_step_size( - const ode_type ode, const int order, const scalar_type t0, - const scalar_type atol, const scalar_type rtol, const vec_type& y0, - const res_type& f0, const mat_type& temp, scalar_type& dt_ini) { +template +KOKKOS_FUNCTION void initial_step_size(const ode_type ode, const int order, const scalar_type t0, + const scalar_type atol, const scalar_type rtol, const vec_type& y0, + const res_type& f0, const mat_type& temp, scalar_type& dt_ini) { using KAT = Kokkos::ArithTraits; // Extract subviews to store intermediate data @@ -290,16 +260,12 @@ KOKKOS_FUNCTION void initial_step_size( } } // initial_step_size -template -KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, - scalar_type t_end, int& order, - int& num_equal_steps, const int max_newton_iters, - const scalar_type atol, const scalar_type rtol, - const scalar_type min_factor, - const vec_type& y_old, const vec_type& y_new, - const res_type& rhs, const res_type& update, - const mat_type& temp, const mat_type& temp2) { +template +KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, scalar_type t_end, int& order, + int& num_equal_steps, const int max_newton_iters, const scalar_type atol, + const scalar_type rtol, const scalar_type min_factor, const vec_type& y_old, + const vec_type& y_new, const res_type& rhs, const res_type& update, const mat_type& temp, + const mat_type& temp2) { using newton_params = KokkosODE::Experimental::Newton_params; constexpr int max_order = 5; @@ -310,10 +276,8 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // kappa gamma(i) = sum_{k=1}^i(1.0 / k); gamma(0) = 0; // NDF coefficients // gamma_k alpha(i) = (1 - kappa(i)) * gamma(i) error_const(i) = kappa(i) * // gamma(i) + 1 / (i + 1) - const Kokkos::Array alpha{ - {0., 1.185, 1.66666667, 1.98421667, 2.16979167, 2.28333333}}; - const Kokkos::Array error_const{ - {1., 0.315, 0.16666667, 0.09911667, 0.11354167, 0.16666667}}; + const Kokkos::Array alpha{{0., 1.185, 1.66666667, 1.98421667, 2.16979167, 2.28333333}}; + const Kokkos::Array error_const{{1., 0.315, 0.16666667, 0.09911667, 0.11354167, 0.16666667}}; // Extract columns of temp to form temporary // subviews to operate on. @@ -322,12 +286,9 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // numCols << std::endl; std::cout << "Extract subview from temp" << // std::endl; int offset = 2; - auto D = Kokkos::subview( - temp, Kokkos::ALL(), - Kokkos::pair(offset, offset + 8)); // y and its derivatives + auto D = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(offset, offset + 8)); // y and its derivatives offset += 8; - auto tempD = Kokkos::subview(temp, Kokkos::ALL(), - Kokkos::pair(offset, offset + 8)); + auto tempD = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(offset, offset + 8)); offset += 8; auto scale = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); ++offset; // Scaling coefficients for error calculation @@ -337,31 +298,26 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, ++offset; // Higher order terms contribution to rhs auto error = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); ++offset; // Error estimate - auto jac = Kokkos::subview( - temp, Kokkos::ALL(), - Kokkos::pair(offset, offset + ode.neqs)); // Jacobian matrix + auto jac = + Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(offset, offset + ode.neqs)); // Jacobian matrix offset += ode.neqs; auto tmp_gesv = Kokkos::subview( - temp, Kokkos::ALL(), - Kokkos::pair( - offset, offset + ode.neqs + 4)); // Buffer space for gesv calculation + temp, Kokkos::ALL(), Kokkos::pair(offset, offset + ode.neqs + 4)); // Buffer space for gesv calculation offset += ode.neqs + 4; - auto coeffs = - Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::pair(0, 6)); - auto gamma = Kokkos::subview(temp2, Kokkos::ALL(), 6); - gamma(0) = 0.0; - gamma(1) = 1.0; - gamma(2) = 1.5; - gamma(3) = 1.83333333; - gamma(4) = 2.08333333; - gamma(5) = 2.28333333; + auto coeffs = Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::pair(0, 6)); + auto gamma = Kokkos::subview(temp2, Kokkos::ALL(), 6); + gamma(0) = 0.0; + gamma(1) = 1.0; + gamma(2) = 1.5; + gamma(3) = 1.83333333; + gamma(4) = 2.08333333; + gamma(5) = 2.28333333; BDF_system_wrapper2 sys(ode, psi, update, t, dt); const newton_params param( max_newton_iters, atol, - Kokkos::max(10 * Kokkos::ArithTraits::eps() / rtol, - Kokkos::min(0.03, Kokkos::sqrt(rtol)))); + Kokkos::max(10 * Kokkos::ArithTraits::eps() / rtol, Kokkos::min(0.03, Kokkos::sqrt(rtol)))); scalar_type max_step = Kokkos::ArithTraits::max(); scalar_type min_step = Kokkos::ArithTraits::min(); @@ -406,12 +362,9 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // Compute psi, the sum of the higher order // contribution to the residual - auto subD = - Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(1, order + 1)); - auto subGamma = - Kokkos::subview(gamma, Kokkos::pair(1, order + 1)); - KokkosBlas::Experimental::serial_gemv('N', 1.0 / alpha[order], subD, - subGamma, 0.0, psi); + auto subD = Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(1, order + 1)); + auto subGamma = Kokkos::subview(gamma, Kokkos::pair(1, order + 1)); + KokkosBlas::Experimental::serial_gemv('N', 1.0 / alpha[order], subD, subGamma, 0.0, psi); sys.compute_jac = true; sys.c = dt / alpha[order]; @@ -420,23 +373,20 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, Kokkos::Experimental::local_deep_copy(y_new, y_predict); Kokkos::Experimental::local_deep_copy(update, 0); KokkosODE::Experimental::newton_solver_status newton_status = - KokkosODE::Experimental::Newton::Solve(sys, param, jac, tmp_gesv, y_new, - rhs, update, scale); + KokkosODE::Experimental::Newton::Solve(sys, param, jac, tmp_gesv, y_new, rhs, update, scale); for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { update(eqIdx) = y_new(eqIdx) - y_predict(eqIdx); } - if (newton_status == - KokkosODE::Experimental::newton_solver_status::MAX_ITER) { + if (newton_status == KokkosODE::Experimental::newton_solver_status::MAX_ITER) { dt = 0.5 * dt; update_D(order, 0.5, coeffs, tempD, D); num_equal_steps = 0; } else { // Estimate the solution error - safety = 0.9 * (2 * max_newton_iters + 1) / - (2 * max_newton_iters + param.iters); + safety = 0.9 * (2 * max_newton_iters + 1) / (2 * max_newton_iters + param.iters); error_norm = 0; for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { scale(eqIdx) = atol + rtol * Kokkos::abs(y_new(eqIdx)); @@ -447,9 +397,8 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, // Check error norm and adapt step size or accept step if (error_norm > 1) { - scalar_type factor = Kokkos::max( - min_factor, safety * Kokkos::pow(error_norm, -1.0 / (order + 1))); - dt = factor * dt; + scalar_type factor = Kokkos::max(min_factor, safety * Kokkos::pow(error_norm, -1.0 / (order + 1))); + dt = factor * dt; update_D(order, factor, coeffs, tempD, D); num_equal_steps = 0; } else { @@ -483,8 +432,7 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, if (1 < order) { for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { - error_low += Kokkos::pow( - error_const[order - 1] * D(eqIdx, order) / scale(eqIdx), 2); + error_low += Kokkos::pow(error_const[order - 1] * D(eqIdx, order) / scale(eqIdx), 2); } error_low = Kokkos::sqrt(error_low) / Kokkos::sqrt(sys.neqs); } else { @@ -493,8 +441,7 @@ KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, if (order < max_order) { for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { - error_high += Kokkos::pow( - error_const[order + 1] * D(eqIdx, order + 2) / scale(eqIdx), 2); + error_high += Kokkos::pow(error_const[order + 1] * D(eqIdx, order + 2) / scale(eqIdx), 2); } error_high = Kokkos::sqrt(error_high) / Kokkos::sqrt(sys.neqs); } else { diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp index ae573801ac..1ca545689a 100644 --- a/ode/impl/KokkosODE_Newton_impl.hpp +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -30,19 +30,18 @@ namespace KokkosODE { namespace Impl { -template +template KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( - system_type& sys, const KokkosODE::Experimental::Newton_params& params, - mat_type& J, mat_type& tmp, ini_vec_type& y0, rhs_vec_type& rhs, - update_type& update, const scale_type& scale) { + system_type& sys, const KokkosODE::Experimental::Newton_params& params, mat_type& J, mat_type& tmp, + ini_vec_type& y0, rhs_vec_type& rhs, update_type& update, const scale_type& scale) { using newton_solver_status = KokkosODE::Experimental::newton_solver_status; using value_type = typename ini_vec_type::non_const_value_type; // Define the type returned by nrm2 to store // the norm of the residual. - using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename ini_vec_type::non_const_value_type>::mag_type; + using norm_type = + typename Kokkos::Details::InnerProductSpaceTraits::mag_type; sys.residual(y0, rhs); const norm_type norm0 = KokkosBlas::serial_nrm2(rhs); norm_type norm = Kokkos::ArithTraits::zero(); @@ -50,9 +49,8 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( norm_type norm_new = Kokkos::ArithTraits::zero(); norm_type rate = Kokkos::ArithTraits::zero(); - const norm_type tol = - Kokkos::max(10 * Kokkos::ArithTraits::eps() / params.rel_tol, - Kokkos::min(0.03, Kokkos::sqrt(params.rel_tol))); + const norm_type tol = Kokkos::max(10 * Kokkos::ArithTraits::eps() / params.rel_tol, + Kokkos::min(0.03, Kokkos::sqrt(params.rel_tol))); // LBV - 07/24/2023: for now assume that we take // a full Newton step. Eventually this value can @@ -73,9 +71,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( sys.jacobian(y0, J); // solve linear problem - int linSolverStat = - KokkosBatched::SerialGesv::invoke( - J, update, rhs, tmp); + int linSolverStat = KokkosBatched::SerialGesv::invoke(J, update, rhs, tmp); KokkosBlas::SerialScale::invoke(-1, update); // update solution // x = x + alpha*update @@ -89,9 +85,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( norm_new = Kokkos::sqrt(norm_new / sys.neqs); if ((it > 0) && norm_old > Kokkos::ArithTraits::zero()) { rate = norm_new / norm_old; - if ((rate >= 1) || - Kokkos::pow(rate, params.max_iters - it) / (1 - rate) * norm_new > - tol) { + if ((rate >= 1) || Kokkos::pow(rate, params.max_iters - it) / (1 - rate) * norm_new > tol) { return newton_solver_status::NLS_DIVERGENCE; } else if ((norm_new == 0) || ((rate / (1 - rate)) * norm_new < tol)) { return newton_solver_status::NLS_SUCCESS; @@ -103,8 +97,7 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( return newton_solver_status::LIN_SOLVE_FAIL; } - if ((norm < (params.rel_tol * norm0)) || - (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { + if ((norm < (params.rel_tol * norm0)) || (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { return newton_solver_status::NLS_SUCCESS; } diff --git a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp index 85a8ec0b45..6a0770d1a7 100644 --- a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp +++ b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp @@ -83,8 +83,7 @@ struct ButcherTableau<1, 1> // Euler-Heun Method static constexpr int order = 2; static constexpr int nstages = 2; // total dimensions, nstagesxnstages system Kokkos::Array a{ - {0.0, 1.0, - 0.0}}; //(nstages*nstages+nstages)/2 size of lower triangular matrix + {0.0, 1.0, 0.0}}; //(nstages*nstages+nstages)/2 size of lower triangular matrix Kokkos::Array b{{0.5, 0.5}}; Kokkos::Array c{{0.0, 1.0}}; Kokkos::Array e{{-0.5, 0.5}}; @@ -100,12 +99,10 @@ struct ButcherTableau<1, 2> // Known as Fehlberg 1-2 method { static constexpr int order = 2; static constexpr int nstages = 3; - Kokkos::Array a{ - {0.0, 0.5, 0.0, 1.0 / 256.0, 255.0 / 256.0, 0.0}}; + Kokkos::Array a{{0.0, 0.5, 0.0, 1.0 / 256.0, 255.0 / 256.0, 0.0}}; Kokkos::Array b{{1.0 / 512.0, 255.0 / 256.0, 1. / 512}}; Kokkos::Array c{{0.0, 1.0 / 2.0, 1.0}}; - Kokkos::Array e{ - {1.0 / 256.0 - 1.0 / 512.0, 0.0, -1.0 / 512.0}}; + Kokkos::Array e{{1.0 / 256.0 - 1.0 / 512.0, 0.0, -1.0 / 512.0}}; }; // Coefficients obtained from: @@ -119,12 +116,10 @@ struct ButcherTableau<2, 3> // Bogacki-Shampine method static constexpr int order = 3; static constexpr int nstages = 4; Kokkos::Array a{ - {0.0, 0.5, 0.0, 0.0, 3.0 / 4.0, 0.0, 2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, - 0.0}}; + {0.0, 0.5, 0.0, 0.0, 3.0 / 4.0, 0.0, 2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, 0.0}}; Kokkos::Array b{{2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, 0.0}}; Kokkos::Array c{{0.0, 0.5, 0.75, 1.0}}; - Kokkos::Array e{{2.0 / 9.0 - 7.0 / 24.0, 1.0 / 3.0 - 0.25, - 4.0 / 9.0 - 1.0 / 3.0, -1.0 / 8.0}}; + Kokkos::Array e{{2.0 / 9.0 - 7.0 / 24.0, 1.0 / 3.0 - 0.25, 4.0 / 9.0 - 1.0 / 3.0, -1.0 / 8.0}}; }; // Coefficients obtained from: @@ -136,10 +131,8 @@ struct ButcherTableau<3, 3> // RK4 { static constexpr int order = 4; static constexpr int nstages = 4; - Kokkos::Array a{ - {0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0}}; - Kokkos::Array b{ - {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}}; + Kokkos::Array a{{0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0}}; + Kokkos::Array b{{1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}}; Kokkos::Array c{{0.0, 0.5, 0.5, 1.0}}; Kokkos::Array e{{1.0 / 6.0, 0.0, -1.0 / 3.0, 1.0 / 6.0}}; }; @@ -175,14 +168,10 @@ struct ButcherTableau<4, 5> // Fehlberg Method 1859.0 / 4104.0, -11.0 / 40.0, 0.0}}; - Kokkos::Array b{{16.0 / 135.0, 0.0, 6656.0 / 12825.0, - 28561.0 / 56430.0, -9.0 / 50.0, - 2.0 / 55.0}}; - Kokkos::Array c{ - {0.0, 0.25, 3.0 / 8.0, 12.0 / 13.0, 1.0, 0.5}}; - Kokkos::Array e{ - {16.0 / 135.0 - 25.0 / 216.0, 0.0, 6656.0 / 12825.0 - 1408.0 / 2565.0, - 28561.0 / 56430.0 - 2197.0 / 4104.0, -9.0 / 50.0 + 0.2, 2.0 / 55.0}}; + Kokkos::Array b{{16.0 / 135.0, 0.0, 6656.0 / 12825.0, 28561.0 / 56430.0, -9.0 / 50.0, 2.0 / 55.0}}; + Kokkos::Array c{{0.0, 0.25, 3.0 / 8.0, 12.0 / 13.0, 1.0, 0.5}}; + Kokkos::Array e{{16.0 / 135.0 - 25.0 / 216.0, 0.0, 6656.0 / 12825.0 - 1408.0 / 2565.0, + 28561.0 / 56430.0 - 2197.0 / 4104.0, -9.0 / 50.0 + 0.2, 2.0 / 55.0}}; }; // Coefficients obtained from: @@ -195,35 +184,31 @@ struct ButcherTableau<4, 5, 1> // Cash-Karp { static constexpr int order = 5; static constexpr int nstages = 6; - Kokkos::Array a{ - {0.0, - 0.2, - 0.0, - 3.0 / 40.0, - 9.0 / 40.0, - 0.0, - 0.3, - -0.9, - 1.2, - 0.0, - -11.0 / 54.0, - 2.5, - -70.0 / 27.0, - 35.0 / 27.0, - 0.0, - 1631.0 / 55296.0, - 175.0 / 512.0, - 575.0 / 13824.0, - 44275.0 / 110592.0, - 253.0 / 4096.0, - 0.0}}; - Kokkos::Array b{ - {37.0 / 378.0, 0.0, 250.0 / 621.0, 125.0 / 594.0, 0.0, 512.0 / 1771.0}}; + Kokkos::Array a{{0.0, + 0.2, + 0.0, + 3.0 / 40.0, + 9.0 / 40.0, + 0.0, + 0.3, + -0.9, + 1.2, + 0.0, + -11.0 / 54.0, + 2.5, + -70.0 / 27.0, + 35.0 / 27.0, + 0.0, + 1631.0 / 55296.0, + 175.0 / 512.0, + 575.0 / 13824.0, + 44275.0 / 110592.0, + 253.0 / 4096.0, + 0.0}}; + Kokkos::Array b{{37.0 / 378.0, 0.0, 250.0 / 621.0, 125.0 / 594.0, 0.0, 512.0 / 1771.0}}; Kokkos::Array c{{0.0, 0.2, 0.3, 0.6, 1.0, 7.0 / 8.0}}; - Kokkos::Array e{{37.0 / 378.0 - 2825.0 / 27648.0, 0.0, - 250.0 / 621.0 - 18575.0 / 48384.0, - 125.0 / 594.0 - 13525.0 / 55296.0, - -277.0 / 14336.0, 512.0 / 1771.0 - 0.25}}; + Kokkos::Array e{{37.0 / 378.0 - 2825.0 / 27648.0, 0.0, 250.0 / 621.0 - 18575.0 / 48384.0, + 125.0 / 594.0 - 13525.0 / 55296.0, -277.0 / 14336.0, 512.0 / 1771.0 - 0.25}}; }; // Coefficients obtained from: @@ -264,14 +249,12 @@ struct ButcherTableau<4, 6> // Referred to as DOPRI5 or RKDP -2187.0 / 6784.0, 11.0 / 84.0, 0.0}}; - Kokkos::Array b{{35.0 / 384.0, 0.0, 500.0 / 1113.0, - 125.0 / 192.0, -2187.0 / 6784.0, - 11.0 / 84.0, 0.0}}; + Kokkos::Array b{ + {35.0 / 384.0, 0.0, 500.0 / 1113.0, 125.0 / 192.0, -2187.0 / 6784.0, 11.0 / 84.0, 0.0}}; Kokkos::Array c{{0.0, 0.2, 0.3, 0.8, 8.0 / 9.0, 1.0, 1.0}}; - Kokkos::Array e{ - {35.0 / 384.0 - 5179.0 / 57600.0, 0.0, 500.0 / 1113.0 - 7571.0 / 16695.0, - 125.0 / 192.0 - 393.0 / 640.0, -2187.0 / 6784.0 + 92097.0 / 339200.0, - 11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}}; + Kokkos::Array e{{35.0 / 384.0 - 5179.0 / 57600.0, 0.0, 500.0 / 1113.0 - 7571.0 / 16695.0, + 125.0 / 192.0 - 393.0 / 640.0, -2187.0 / 6784.0 + 92097.0 / 339200.0, + 11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}}; }; } // namespace Impl diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp index f5fe39d65d..83ab76758f 100644 --- a/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -30,12 +30,9 @@ namespace Impl { // k_i = f(t+c_i*dt, y_old+sum(a_{ij}*k_i)) j in [1, i-1] // we need to compute the k_i and store them as we go // to use them for k_{i+1} computation. -template -KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, - const bool adaptivity, scalar_type t, - scalar_type dt, const vec_type& y_old, - const vec_type& y_new, const vec_type& temp, +template +KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, const bool adaptivity, scalar_type t, + scalar_type dt, const vec_type& y_old, const vec_type& y_new, const vec_type& temp, const mv_type& k_vecs) { const int neqs = ode.neqs; const int nstages = table.nstages; @@ -64,8 +61,7 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, for (int idx = 0; idx < stageIdx; ++idx) { for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { - temp(eqIdx) += - table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(idx, eqIdx); + temp(eqIdx) += table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(idx, eqIdx); } } KokkosBlas::SerialScale::invoke(dt, temp); @@ -88,13 +84,12 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, } } // RKStep -template -KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( - const ode_type& ode, const table_type& table, - const KokkosODE::Experimental::ODE_params& params, - const scalar_type t_start, const scalar_type t_end, const vec_type& y0, - const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { +template +KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, const table_type& table, + const KokkosODE::Experimental::ODE_params& params, + const scalar_type t_start, const scalar_type t_end, + const vec_type& y0, const vec_type& y, const vec_type& temp, + const mv_type& k_vecs) { constexpr scalar_type error_threshold = 1; bool adapt = params.adaptivity; bool dt_was_reduced; @@ -107,8 +102,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( scalar_type dt = (t_end - t_start) / params.max_steps; // Loop over time steps to integrate ODE - for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); - ++stepIdx) { + for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); ++stepIdx) { // Check that the step attempted is not putting // the solution past t_end, otherwise shrink dt if (t_end < t_now + dt) { @@ -138,9 +132,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { error = Kokkos::max(error, Kokkos::abs(temp(eqIdx))); tol = Kokkos::max( - tol, params.abs_tol + - params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), - Kokkos::abs(y0(eqIdx)))); + tol, params.abs_tol + params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), Kokkos::abs(y0(eqIdx)))); } error = error / tol; @@ -148,12 +140,11 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( // is too large and current step // is rejected. if (error > 1) { - dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); + dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); dt_was_reduced = true; } - if (dt < params.min_step_size) - return Experimental::ode_solver_status::MIN_SIZE; + if (dt < params.min_step_size) return Experimental::ode_solver_status::MIN_SIZE; } } @@ -166,10 +157,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( if (t_now < t_end) { if (adapt && !dt_was_reduced && error < 0.5) { // Compute new time increment - dt = dt * - Kokkos::min( - 10.0, - Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); + dt = dt * Kokkos::min(10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); } } else { return Experimental::ode_solver_status::SUCCESS; diff --git a/ode/src/KokkosODE_BDF.hpp b/ode/src/KokkosODE_BDF.hpp index 71a450a1c6..419316ba45 100644 --- a/ode/src/KokkosODE_BDF.hpp +++ b/ode/src/KokkosODE_BDF.hpp @@ -29,14 +29,7 @@ namespace KokkosODE { namespace Experimental { -enum BDF_type : int { - BDF1 = 0, - BDF2 = 1, - BDF3 = 2, - BDF4 = 3, - BDF5 = 4, - BDF6 = 5 -}; +enum BDF_type : int { BDF1 = 0, BDF2 = 1, BDF3 = 2, BDF4 = 3, BDF5 = 4, BDF6 = 5 }; template struct BDF_coeff_helper { @@ -91,14 +84,11 @@ template struct BDF { using table_type = typename BDF_coeff_helper::table_type; - template - KOKKOS_FUNCTION static void Solve( - const ode_type& ode, const scalar_type t_start, const scalar_type t_end, - const int num_steps, const vec_type& y0, const vec_type& y, - const vec_type& rhs, const vec_type& update, const vec_type& scale, - const mv_type& y_vecs, const mv_type& kstack, const mat_type& temp, - const mat_type& jac) { + template + KOKKOS_FUNCTION static void Solve(const ode_type& ode, const scalar_type t_start, const scalar_type t_end, + const int num_steps, const vec_type& y0, const vec_type& y, const vec_type& rhs, + const vec_type& update, const vec_type& scale, const mv_type& y_vecs, + const mv_type& kstack, const mat_type& temp, const mat_type& jac) { const table_type table{}; const double dt = (t_end - t_start) / num_steps; @@ -117,8 +107,7 @@ struct BDF { } KokkosODE::Experimental::ODE_params params(table.order - 1); for (int stepIdx = 0; stepIdx < init_steps; ++stepIdx) { - KokkosODE::Experimental::RungeKutta::Solve( - ode, params, t, t + dt, y0, y, update, kstack); + KokkosODE::Experimental::RungeKutta::Solve(ode, params, t, t + dt, y0, y, update, kstack); for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y_vecs(eqIdx, stepIdx + 1) = y(eqIdx); @@ -128,8 +117,7 @@ struct BDF { } for (int stepIdx = init_steps; stepIdx < num_steps; ++stepIdx) { - KokkosODE::Impl::BDFStep(ode, table, t, dt, y0, y, rhs, update, scale, - y_vecs, temp, jac); + KokkosODE::Impl::BDFStep(ode, table, t, dt, y0, y, rhs, update, scale, y_vecs, temp, jac); // Update history for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { @@ -167,12 +155,9 @@ struct BDF { /// \param temp [in]: vectors for temporary storage /// \param temp2 [in]: vectors for temporary storage template -KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, - const scalar_type t_end, - const scalar_type initial_step, - const scalar_type max_step, const vec_type& y0, - const vec_type& y_new, mat_type& temp, - mat_type& temp2) { +KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, const scalar_type t_end, + const scalar_type initial_step, const scalar_type max_step, const vec_type& y0, + const vec_type& y_new, mat_type& temp, mat_type& temp2) { using KAT = Kokkos::ArithTraits; // This needs to go away and be pulled out of temp instead... @@ -195,8 +180,7 @@ KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, // Check if we need to compute the initial // time step size. if (initial_step == KAT::zero()) { - KokkosODE::Impl::initial_step_size(ode, order, t_start, atol, rtol, y0, rhs, - temp, dt); + KokkosODE::Impl::initial_step_size(ode, order, t_start, atol, rtol, y0, rhs, temp, dt); } // Initialize D(:, 0) = y0 and D(:, 1) = dt*rhs @@ -210,8 +194,7 @@ KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, // Now we loop over the time interval [t_start, t_end] // and solve our ODE. while (t < t_end) { - KokkosODE::Impl::BDFStep(ode, t, dt, t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, min_factor, y0, + KokkosODE::Impl::BDFStep(ode, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, min_factor, y0, y_new, rhs, update, temp, temp2); for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { diff --git a/ode/src/KokkosODE_Newton.hpp b/ode/src/KokkosODE_Newton.hpp index ffccba5cd3..5686423e9e 100644 --- a/ode/src/KokkosODE_Newton.hpp +++ b/ode/src/KokkosODE_Newton.hpp @@ -30,14 +30,13 @@ namespace Experimental { /// \brief Newton solver for non-linear system of equations struct Newton { - template - KOKKOS_FUNCTION static newton_solver_status Solve( - const system_type& sys, const Newton_params& params, const mat_type& J, - const mat_type& tmp, const ini_vec_type& y0, const rhs_vec_type& rhs, - const update_type& update, const scale_type& scale) { - return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update, - scale); + template + KOKKOS_FUNCTION static newton_solver_status Solve(const system_type& sys, const Newton_params& params, + const mat_type& J, const mat_type& tmp, const ini_vec_type& y0, + const rhs_vec_type& rhs, const update_type& update, + const scale_type& scale) { + return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update, scale); } }; diff --git a/ode/src/KokkosODE_RungeKutta.hpp b/ode/src/KokkosODE_RungeKutta.hpp index b4711de81c..2d298a6568 100644 --- a/ode/src/KokkosODE_RungeKutta.hpp +++ b/ode/src/KokkosODE_RungeKutta.hpp @@ -31,8 +31,8 @@ namespace Experimental { /// \brief RK_type is an enum tye that conveniently /// describes the Runge-Kutta methods implemented. enum RK_type : int { - RKFE = 0, ///< Forward Euler method (no adaptivity available for this method) - RKEH = 1, ///< Euler-Heun method + RKFE = 0, ///< Forward Euler method (no adaptivity available for this method) + RKEH = 1, ///< Euler-Heun method RKF12 = 2, ///< Fehlberg order 2 method RKBS = 3, ///< Bogacki-Shampine method RK4 = 4, ///< Runge-Kutta classic order 4 method @@ -126,13 +126,11 @@ struct RungeKutta { /// \return ode_solver_status an enum that describes success of failure /// of the integration method once it at terminated. template - KOKKOS_FUNCTION static ode_solver_status Solve( - const ode_type& ode, const KokkosODE::Experimental::ODE_params& params, - const scalar_type t_start, const scalar_type t_end, const vec_type& y0, - const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { + KOKKOS_FUNCTION static ode_solver_status Solve(const ode_type& ode, const KokkosODE::Experimental::ODE_params& params, + const scalar_type t_start, const scalar_type t_end, const vec_type& y0, + const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { table_type table; - return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, - temp, k_vecs); + return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, temp, k_vecs); } }; diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp index 5fb2c44846..2145afb718 100644 --- a/ode/src/KokkosODE_Types.hpp +++ b/ode/src/KokkosODE_Types.hpp @@ -32,17 +32,12 @@ struct ODE_params { // be constant such that dt = (tend - tstart) / num_steps; KOKKOS_FUNCTION ODE_params(const int num_steps_) - : adaptivity(false), - num_steps(num_steps_), - max_steps(num_steps_), - abs_tol(0), - rel_tol(0), - min_step_size(0) {} + : adaptivity(false), num_steps(num_steps_), max_steps(num_steps_), abs_tol(0), rel_tol(0), min_step_size(0) {} /// ODE_parms construtor for adaptive time stepping. KOKKOS_FUNCTION - ODE_params(const int num_steps_, const int max_steps_, const double abs_tol_, - const double rel_tol_, const double min_step_size_) + ODE_params(const int num_steps_, const int max_steps_, const double abs_tol_, const double rel_tol_, + const double min_step_size_) : adaptivity(true), num_steps(num_steps_), max_steps(max_steps_), @@ -68,8 +63,7 @@ struct Newton_params { // double abs_tol_ [in]: absolute tolerance to reach for successful solve // double rel_tol_ [in]: relative tolerance to reach for successful solve KOKKOS_FUNCTION - Newton_params(const int max_iters_, const double abs_tol_, - const double rel_tol_) + Newton_params(const int max_iters_, const double abs_tol_, const double rel_tol_) : max_iters(max_iters_), abs_tol(abs_tol_), rel_tol(rel_tol_) {} }; diff --git a/ode/unit_test/Test_ODE_BDF.hpp b/ode/unit_test/Test_ODE_BDF.hpp index 8360302971..8f8319cb1d 100644 --- a/ode/unit_test/Test_ODE_BDF.hpp +++ b/ode/unit_test/Test_ODE_BDF.hpp @@ -37,23 +37,19 @@ struct Logistic { Logistic(double r_, double K_) : r(r_), K(K_){}; template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = r * y(0) * (1.0 - y(0) / K); } template - KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, - const double /*dt*/, const vec_type& y, + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, const double /*dt*/, const vec_type& y, const mat_type& jac) const { jac(0, 0) = r - 2 * r * y(0) / K; } template - KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, - const vec_type& y) const { + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, const vec_type& y) const { y(0) = K / (1 + (K - y0) / y0 * Kokkos::exp(-r * t)); } @@ -78,17 +74,14 @@ struct LotkaVolterra { : alpha(alpha_), beta(beta_), delta(delta_), gamma(gamma_){}; template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = alpha * y(0) - beta * y(0) * y(1); f(1) = delta * y(0) * y(1) - gamma * y(1); } template - KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, - const double /*dt*/, const vec_type& y, + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, const double /*dt*/, const vec_type& y, const mat_type& jac) const { jac(0, 0) = alpha - beta * y(1); jac(0, 1) = -beta * y(0); @@ -112,9 +105,7 @@ struct StiffChemistry { StiffChemistry() {} template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); f(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); @@ -122,8 +113,7 @@ struct StiffChemistry { } template - KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, - const double /*dt*/, const vec_type& y, + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, const double /*dt*/, const vec_type& y, const mat_type& jac) const { jac(0, 0) = -0.04; jac(0, 1) = 1.e4 * y(2); @@ -137,8 +127,8 @@ struct StiffChemistry { } }; -template +template struct BDFSolve_wrapper { ode_type my_ode; scalar_type tstart, tend; @@ -147,12 +137,9 @@ struct BDFSolve_wrapper { mv_type y_vecs, kstack; mat_type temp, jac; - BDFSolve_wrapper(const ode_type& my_ode_, const scalar_type tstart_, - const scalar_type tend_, const int num_steps_, - const vec_type& y_old_, const vec_type& y_new_, - const vec_type& rhs_, const vec_type& update_, - const vec_type& scale_, const mv_type& y_vecs_, - const mv_type& kstack_, const mat_type& temp_, + BDFSolve_wrapper(const ode_type& my_ode_, const scalar_type tstart_, const scalar_type tend_, const int num_steps_, + const vec_type& y_old_, const vec_type& y_new_, const vec_type& rhs_, const vec_type& update_, + const vec_type& scale_, const mv_type& y_vecs_, const mv_type& kstack_, const mat_type& temp_, const mat_type& jac_) : my_ode(my_ode_), tstart(tstart_), @@ -170,9 +157,8 @@ struct BDFSolve_wrapper { KOKKOS_FUNCTION void operator()(const int /*idx*/) const { - KokkosODE::Experimental::BDF::Solve( - my_ode, tstart, tend, num_steps, y_old, y_new, rhs, update, scale, - y_vecs, kstack, temp, jac); + KokkosODE::Experimental::BDF::Solve(my_ode, tstart, tend, num_steps, y_old, y_new, rhs, update, scale, + y_vecs, kstack, temp, jac); } }; @@ -183,11 +169,9 @@ struct BDF_Solve_wrapper { const vec_type y0, y_new; const mat_type temp, temp2; - BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, - const scalar_type& t_end_, const scalar_type& dt_, - const scalar_type& max_step_, const vec_type& y0_, - const vec_type& y_new_, const mat_type& temp_, - const mat_type& temp2_) + BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, const scalar_type& t_end_, + const scalar_type& dt_, const scalar_type& max_step_, const vec_type& y0_, const vec_type& y_new_, + const mat_type& temp_, const mat_type& temp2_) : my_ode(my_ode_), t_start(t_start_), t_end(t_end_), @@ -199,8 +183,7 @@ struct BDF_Solve_wrapper { temp2(temp2_) {} KOKKOS_FUNCTION void operator()(const int) const { - KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, y0, - y_new, temp, temp2); + KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, y0, y_new, temp, temp2); } }; @@ -221,8 +204,7 @@ void test_BDF_Logistic() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); vec_type scale("scaling factors", mySys.neqs); - mat_type jac("jacobian", mySys.neqs, mySys.neqs), - temp("temp storage", mySys.neqs, mySys.neqs + 4); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), temp("temp storage", mySys.neqs, mySys.neqs + 4); mv_type kstack("Startup RK vectors", 6, mySys.neqs); Kokkos::deep_copy(scale, 1); @@ -239,26 +221,21 @@ void test_BDF_Logistic() { Kokkos::deep_copy(y0, 0.5); Kokkos::deep_copy(y_vecs, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); EXPECT_NEAR_KK_REL(measured_order, 2.0, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 2, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 2.0) / 2.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 2.0) / 2.0 << std::endl; #endif // Test BDF2 @@ -269,26 +246,21 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 2); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); EXPECT_NEAR_KK_REL(measured_order, 4.0, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 4, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 4.0) / 4.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 4.0) / 4.0 << std::endl; #endif // Test BDF3 @@ -299,26 +271,21 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 3); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); EXPECT_NEAR_KK_REL(measured_order, 8.0, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 8, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 8.0) / 8.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 8.0) / 8.0 << std::endl; #endif // Test BDF4 @@ -329,25 +296,20 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 4); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 16, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 16.0) / 16.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 16.0) / 16.0 << std::endl; #endif // Test BDF5 @@ -358,25 +320,20 @@ void test_BDF_Logistic() { mv_type y_vecs("history vectors", mySys.neqs, 5); Kokkos::deep_copy(y0, 0.5); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, - update, scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); Kokkos::fence(); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / - Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); } - measured_order = - Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + measured_order = Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "expected ratio: 32, actual ratio: " << measured_order - << ", order error=" << Kokkos::abs(measured_order - 32.0) / 32.0 - << std::endl; + << ", order error=" << Kokkos::abs(measured_order - 32.0) / 32.0 << std::endl; #endif } // test_BDF_Logistic @@ -394,8 +351,7 @@ void test_BDF_LotkaVolterra() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); vec_type scale("scaling factors", mySys.neqs); - mat_type jac("jacobian", mySys.neqs, mySys.neqs), - temp("temp storage", mySys.neqs, mySys.neqs + 4); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), temp("temp storage", mySys.neqs, mySys.neqs + 4); Kokkos::deep_copy(scale, 1); @@ -407,10 +363,8 @@ void test_BDF_LotkaVolterra() { Kokkos::deep_copy(y_vecs, 10.0); Kokkos::RangePolicy myPolicy(0, 1); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, scale, - y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); } @@ -427,8 +381,7 @@ void test_BDF_StiffChemistry() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); vec_type scale("scaling factors", mySys.neqs); - mat_type jac("jacobian", mySys.neqs, mySys.neqs), - temp("temp storage", mySys.neqs, mySys.neqs + 4); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), temp("temp storage", mySys.neqs, mySys.neqs + 4); Kokkos::deep_copy(scale, 1); @@ -444,10 +397,8 @@ void test_BDF_StiffChemistry() { Kokkos::deep_copy(y_vecs, 0.0); Kokkos::RangePolicy myPolicy(0, 1); - BDFSolve_wrapper - solve_wrapper(mySys, t_start, t_end, 110000, y0, y_new, rhs, update, - scale, y_vecs, kstack, temp, jac); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 110000, y0, y_new, rhs, update, scale, y_vecs, kstack, temp, jac); Kokkos::parallel_for(myPolicy, solve_wrapper); } @@ -559,8 +510,7 @@ void test_BDF_StiffChemistry() { // } template -void compute_coeffs(const int order, const scalar_type factor, - const mat_type& coeffs) { +void compute_coeffs(const int order, const scalar_type factor, const mat_type& coeffs) { std::cout << "compute_coeffs" << std::endl; coeffs(0, 0) = 1.0; @@ -568,35 +518,28 @@ void compute_coeffs(const int order, const scalar_type factor, coeffs(0, colIdx + 1) = 1.0; for (int rowIdx = 0; rowIdx < order; ++rowIdx) { coeffs(rowIdx + 1, colIdx + 1) = - ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * - coeffs(rowIdx, colIdx + 1); + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * coeffs(rowIdx, colIdx + 1); } } } template -void update_D(const int order, const scalar_type factor, const mat_type& coeffs, - const mat_type& tempD, const mat_type& D) { - auto subD = - Kokkos::subview(D, Kokkos::pair(0, order + 1), Kokkos::ALL); - auto subTempD = - Kokkos::subview(tempD, Kokkos::pair(0, order + 1), Kokkos::ALL); +void update_D(const int order, const scalar_type factor, const mat_type& coeffs, const mat_type& tempD, + const mat_type& D) { + auto subD = Kokkos::subview(D, Kokkos::pair(0, order + 1), Kokkos::ALL); + auto subTempD = Kokkos::subview(tempD, Kokkos::pair(0, order + 1), Kokkos::ALL); compute_coeffs(order, factor, coeffs); - auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), Kokkos::pair(0, order + 1)); std::cout << "SerialGemm" << std::endl; - KokkosBatched::SerialGemm< - KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, R, subD, 0.0, subTempD); + KokkosBatched::SerialGemm::invoke(1.0, R, subD, 0.0, subTempD); compute_coeffs(order, 1.0, coeffs); - auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), - Kokkos::pair(0, order + 1)); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), Kokkos::pair(0, order + 1)); std::cout << "SerialGemm" << std::endl; - KokkosBatched::SerialGemm< - KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, U, subTempD, 0.0, subD); + KokkosBatched::SerialGemm::invoke(1.0, U, subTempD, 0.0, subD); } template @@ -604,10 +547,8 @@ void test_Nordsieck() { using execution_space = Kokkos::HostSpace; StiffChemistry mySys{}; - Kokkos::View R("coeffs", 6, 6), - U("coeffs", 6, 6); - Kokkos::View D("D", 8, mySys.neqs), - tempD("tmp", 8, mySys.neqs); + Kokkos::View R("coeffs", 6, 6), U("coeffs", 6, 6); + Kokkos::View D("D", 8, mySys.neqs), tempD("tmp", 8, mySys.neqs); int order = 1; double factor = 0.8; @@ -639,17 +580,13 @@ void test_Nordsieck() { } std::cout << "D before update:" << std::endl; - std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" - << std::endl; - std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" - << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" << std::endl; + std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" << std::endl; update_D(order, factor, R, tempD, D); std::cout << "D after update:" << std::endl; - std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" - << std::endl; - std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" - << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" << std::endl; + std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" << std::endl; } template @@ -668,8 +605,7 @@ void test_adaptive_BDF() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); - mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), - temp2("buffer2", 6, 7); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), temp2("buffer2", 6, 7); // Initial condition Kokkos::deep_copy(y0, 0.5); @@ -688,13 +624,11 @@ void test_adaptive_BDF() { std::cout << "Initial conditions" << std::endl; std::cout << " y0=" << y0(0) << ", t=" << t << ", dt=" << dt << std::endl; - std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) - << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " - << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << ", " << D(0, 3) << ", " << D(0, 4) + << ", " << D(0, 5) << ", " << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; - KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, - update, temp, temp2); + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, 0.2, y0, y_new, + rhs, update, temp, temp2); for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { y0(eqIdx) = y_new(eqIdx); @@ -706,13 +640,11 @@ void test_adaptive_BDF() { std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; - std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) - << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " - << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << ", " << D(0, 3) << ", " << D(0, 4) + << ", " << D(0, 5) << ", " << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; - KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, - update, temp, temp2); + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, 0.2, y0, y_new, + rhs, update, temp, temp2); for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { y0(eqIdx) = y_new(eqIdx); @@ -724,13 +656,11 @@ void test_adaptive_BDF() { std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; - std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) - << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " - << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << ", " << D(0, 3) << ", " << D(0, 4) + << ", " << D(0, 5) << ", " << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; - KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, - max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, - update, temp, temp2); + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, max_newton_iters, atol, rtol, 0.2, y0, y_new, + rhs, update, temp, temp2); std::cout << "Final t: " << t << ", y=" << y_new(0) << std::endl; @@ -751,22 +681,18 @@ void test_adaptive_BDF_v2() { vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); Kokkos::deep_copy(y0, 0.5); - mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), - temp2("buffer2", 6, 7); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), temp2("buffer2", 6, 7); { scalar_type dt = KAT::zero(); vec_type f0("initial value f", mySys.neqs); mySys.evaluate_function(t_start, KAT::zero(), y0, f0); - KokkosODE::Impl::initial_step_size(mySys, 1, t_start, 1e-6, 1e-3, y0, f0, - temp, dt); + KokkosODE::Impl::initial_step_size(mySys, 1, t_start, 1e-6, 1e-3, y0, f0, temp, dt); std::cout << "Initial Step Size: dt=" << dt << std::endl; } - KokkosODE::Experimental::BDFSolve(mySys, t_start, t_end, 0.0117188, - (t_end - t_start) / 10, y0, y_new, temp, - temp2); + KokkosODE::Experimental::BDFSolve(mySys, t_start, t_end, 0.0117188, (t_end - t_start) / 10, y0, y_new, temp, temp2); } template @@ -789,42 +715,30 @@ void test_BDF_adaptive_stiff() { y0_h(2) = KAT::zero(); Kokkos::deep_copy(y0, y0_h); - mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), - temp2("buffer2", 6, 7); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), temp2("buffer2", 6, 7); Kokkos::RangePolicy policy(0, 1); - BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, - (t_end - t_start) / 10, y0, y_new, temp, temp2); + BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, (t_end - t_start) / 10, y0, y_new, temp, temp2); Kokkos::parallel_for(policy, bdf_wrapper); auto y_new_h = Kokkos::create_mirror_view(y_new); Kokkos::deep_copy(y_new_h, y_new); - std::cout << "Stiff Chemistry solution at t=500: {" << y_new_h(0) << ", " - << y_new_h(1) << ", " << y_new_h(2) << "}" << std::endl; + std::cout << "Stiff Chemistry solution at t=500: {" << y_new_h(0) << ", " << y_new_h(1) << ", " << y_new_h(2) << "}" + << std::endl; } } // namespace Test -TEST_F(TestCategory, BDF_Logistic_serial) { - ::Test::test_BDF_Logistic(); -} -TEST_F(TestCategory, BDF_LotkaVolterra_serial) { - ::Test::test_BDF_LotkaVolterra(); -} -TEST_F(TestCategory, BDF_StiffChemistry_serial) { - ::Test::test_BDF_StiffChemistry(); -} +TEST_F(TestCategory, BDF_Logistic_serial) { ::Test::test_BDF_Logistic(); } +TEST_F(TestCategory, BDF_LotkaVolterra_serial) { ::Test::test_BDF_LotkaVolterra(); } +TEST_F(TestCategory, BDF_StiffChemistry_serial) { ::Test::test_BDF_StiffChemistry(); } // TEST_F(TestCategory, BDF_parallel_serial) { // ::Test::test_BDF_parallel(); // } -TEST_F(TestCategory, BDF_Nordsieck) { - ::Test::test_Nordsieck(); -} +TEST_F(TestCategory, BDF_Nordsieck) { ::Test::test_Nordsieck(); } // TEST_F(TestCategory, BDF_adaptive) { // ::Test::test_adaptive_BDF(); // ::Test::test_adaptive_BDF_v2(); // } -TEST_F(TestCategory, BDF_StiffChemistry_adaptive) { - ::Test::test_BDF_adaptive_stiff(); -} +TEST_F(TestCategory, BDF_StiffChemistry_adaptive) { ::Test::test_BDF_adaptive_stiff(); } diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp index 45dd4adb6a..c37142ee8f 100644 --- a/ode/unit_test/Test_ODE_Newton.hpp +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -21,8 +21,7 @@ namespace Test { -template +template struct NewtonSolve_wrapper { using newton_params = KokkosODE::Experimental::Newton_params; @@ -35,11 +34,9 @@ struct NewtonSolve_wrapper { scale_type scale; - NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, - const vec_type& x_, const vec_type& rhs_, - const vec_type& update_, const mat_type& J_, - const mat_type& tmp_, const status_view& status_, - const scale_type& scale_) + NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, const vec_type& x_, + const vec_type& rhs_, const vec_type& update_, const mat_type& J_, const mat_type& tmp_, + const status_view& status_, const scale_type& scale_) : my_nls(my_nls_), params(params_), x(x_), @@ -54,38 +51,27 @@ struct NewtonSolve_wrapper { void operator()(const int idx) const { // Take subviews to create the local problem auto local_x = Kokkos::subview( - x, Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1)))); + x, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1)))); auto local_rhs = Kokkos::subview( - rhs, Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1)))); + rhs, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1)))); auto local_update = Kokkos::subview( - update, - Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1)))); + update, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1)))); auto local_J = Kokkos::subview( - J, - Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1))), + J, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1))), Kokkos::ALL()); auto local_tmp = Kokkos::subview( - tmp, - Kokkos::pair(static_cast(my_nls.neqs * idx), - static_cast(my_nls.neqs * (idx + 1))), + tmp, Kokkos::pair(static_cast(my_nls.neqs * idx), static_cast(my_nls.neqs * (idx + 1))), Kokkos::ALL()); // Run Newton nonlinear solver - status(idx) = KokkosODE::Experimental::Newton::Solve( - my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update, - scale); + status(idx) = KokkosODE::Experimental::Newton::Solve(my_nls, params, local_J, local_tmp, local_x, local_rhs, + local_update, scale); } }; template -void run_newton_test(const system_type& mySys, - KokkosODE::Experimental::Newton_params& params, - const scalar_type* const initial_val, - const scalar_type* const solution) { +void run_newton_test(const system_type& mySys, KokkosODE::Experimental::Newton_params& params, + const scalar_type* const initial_val, const scalar_type* const solution) { using execution_space = typename Device::execution_space; using newton_solver_status = KokkosODE::Experimental::newton_solver_status; using vec_type = typename Kokkos::View; @@ -96,14 +82,12 @@ void run_newton_test(const system_type& mySys, vec_type scale("scaling factors", mySys.neqs); Kokkos::deep_copy(scale, 1); - vec_type x("solution vector", mySys.neqs), - rhs("right hand side vector", mySys.neqs); + vec_type x("solution vector", mySys.neqs), rhs("right hand side vector", mySys.neqs); auto x_h = Kokkos::create_mirror_view(x); auto r_h = Kokkos::create_mirror_view(rhs); vec_type update("update", mySys.neqs); - mat_type J("jacobian", mySys.neqs, mySys.neqs), - tmp("temp mem", mySys.neqs, mySys.neqs + 4); + mat_type J("jacobian", mySys.neqs, mySys.neqs), tmp("temp mem", mySys.neqs, mySys.neqs + 4); // Initial values for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { @@ -112,8 +96,7 @@ void run_newton_test(const system_type& mySys, Kokkos::deep_copy(x, x_h); Kokkos::RangePolicy my_policy(0, 1); - NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status, scale); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); @@ -131,9 +114,7 @@ void run_newton_test(const system_type& mySys, } std::cout << " ), " << KokkosBlas::serial_nrm2(rhs) << ", ("; for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { - std::cout << " " - << Kokkos::abs(x_h(eqIdx) - solution[eqIdx]) / - Kokkos::abs(solution[eqIdx]); + std::cout << " " << Kokkos::abs(x_h(eqIdx) - solution[eqIdx]) / Kokkos::abs(solution[eqIdx]); } std::cout << " )]" << std::endl; #else @@ -154,13 +135,9 @@ struct QuadraticEquation { QuadraticEquation() {} - KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { - f(0) = y(0) * y(0) - y(0) - 2; - } + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { f(0) = y(0) * y(0) - y(0) - 2; } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { - jac(0, 0) = 2 * y(0) - 1; - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = 2 * y(0) - 1; } }; // Trigonometric equation @@ -176,13 +153,9 @@ struct TrigonometricEquation { TrigonometricEquation() {} - KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { - f(0) = Kokkos::cos(y(0)) - y(0); - } + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { f(0) = Kokkos::cos(y(0)) - y(0); } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { - jac(0, 0) = -Kokkos::sin(y(0)) - 1; - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = -Kokkos::sin(y(0)) - 1; } }; // Logarithmic equation @@ -202,9 +175,7 @@ struct LogarithmicEquation { f(0) = 7 * y(0) - Kokkos::log(7 * y(0)) - 1; } - KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { - jac(0, 0) = 7 - 1 / y(0); - } + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { jac(0, 0) = 7 - 1 / y(0); } }; template @@ -238,9 +209,8 @@ void test_newton_status() { #ifdef HAVE_KOKKOSKERNELS_DEBUG scalar_type solution[3] = {2.0, -1.0, 0.0}; #endif - newton_solver_status newton_status[3] = { - newton_solver_status::NLS_SUCCESS, newton_solver_status::NLS_DIVERGENCE, - newton_solver_status::LIN_SOLVE_FAIL}; + newton_solver_status newton_status[3] = {newton_solver_status::NLS_SUCCESS, newton_solver_status::NLS_DIVERGENCE, + newton_solver_status::LIN_SOLVE_FAIL}; vec_type x("solution vector", 1), rhs("right hand side vector", 1); auto x_h = Kokkos::create_mirror_view(x); auto r_h = Kokkos::create_mirror_view(rhs); @@ -253,8 +223,7 @@ void test_newton_status() { Kokkos::deep_copy(x, initial_value[idx]); Kokkos::RangePolicy my_policy(0, 1); - NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, - status, scale); + NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(status_h, status); @@ -263,10 +232,8 @@ void test_newton_status() { #ifdef HAVE_KOKKOSKERNELS_DEBUG Kokkos::deep_copy(x_h, x); Kokkos::deep_copy(r_h, rhs); - printf("Non-linear problem solution and residual with initial value %f:\n", - initial_value[idx]); - printf(" [%f, %g, %g]\n", x_h(0), r_h(0), - Kokkos::abs(x_h(0) - solution[idx]) / Kokkos::abs(solution[idx])); + printf("Non-linear problem solution and residual with initial value %f:\n", initial_value[idx]); + printf(" [%f, %g, %g]\n", x_h(0), r_h(0), Kokkos::abs(x_h(0) - solution[idx]) / Kokkos::abs(solution[idx])); #endif } } @@ -296,8 +263,7 @@ void test_simple_problems() { system_type mySys{}; scalar_type initial_value[2] = {1.0, -0.5}, solution[2] = {2.0, -1.0}; for (int idx = 0; idx < 2; ++idx) { - run_newton_test( - mySys, params, &(initial_value[idx]), &(solution[idx])); + run_newton_test(mySys, params, &(initial_value[idx]), &(solution[idx])); } #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Quadratic Equation problem" << std::endl; @@ -312,8 +278,7 @@ void test_simple_problems() { using system_type = TrigonometricEquation; system_type mySys{}; scalar_type initial_value[1] = {0.1}, solution[1] = {0.739085}; - run_newton_test(mySys, params, - initial_value, solution); + run_newton_test(mySys, params, initial_value, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Trigonometric Equation problem" << std::endl; #endif @@ -327,10 +292,8 @@ void test_simple_problems() { using system_type = LogarithmicEquation; system_type mySys{}; scalar_type initial_value[1] = {static_cast(0.5)}, - solution[1] = {static_cast(1.0) / - static_cast(7.0)}; - run_newton_test(mySys, params, - initial_value, solution); + solution[1] = {static_cast(1.0) / static_cast(7.0)}; + run_newton_test(mySys, params, initial_value, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Logarithmic Equation problem" << std::endl; #endif @@ -431,8 +394,7 @@ void test_simple_systems() { system_type mySys{}; scalar_type initial_values[2] = {1.5, 1.5}; scalar_type solution[2] = {10.75 / 6, 0.8887803753}; - run_newton_test(mySys, params, - initial_values, solution); + run_newton_test(mySys, params, initial_values, solution); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Circles Intersetcion problem" << std::endl; #endif @@ -441,8 +403,7 @@ void test_simple_systems() { { // Second problem: circle / hyperbola intersection #ifdef HAVE_KOKKOSKERNELS_DEBUG - std::cout << "\nStarting Circle/Hyperbola Intersetcion problem" - << std::endl; + std::cout << "\nStarting Circle/Hyperbola Intersetcion problem" << std::endl; #endif using system_type = CircleHyperbolaIntersection; system_type mySys{}; @@ -450,12 +411,9 @@ void test_simple_systems() { scalar_type init_vals[2] = {0.0, 1.0}; scalar_type solutions[2] = { Kokkos::ArithTraits::one() / - Kokkos::sqrt(static_cast( - 4 + Kokkos::sqrt(static_cast(12.0)) / 2)), - Kokkos::sqrt(static_cast( - (4 + Kokkos::sqrt(static_cast(12.0))) / 2))}; - run_newton_test(mySys, params, init_vals, - solutions); + Kokkos::sqrt(static_cast(4 + Kokkos::sqrt(static_cast(12.0)) / 2)), + Kokkos::sqrt(static_cast((4 + Kokkos::sqrt(static_cast(12.0))) / 2))}; + run_newton_test(mySys, params, init_vals, solutions); #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "Finished Circle/Hyperbola Intersetcion problem" << std::endl; #endif @@ -502,8 +460,7 @@ void test_newton_on_device() { mat_type J("jacobian", mySys.neqs * num_systems, mySys.neqs); mat_type tmp("temp mem", mySys.neqs * num_systems, mySys.neqs + 4); - Kokkos::View status("solver status", - num_systems); + Kokkos::View status("solver status", num_systems); auto x_h = Kokkos::create_mirror_view(x); auto r_h = Kokkos::create_mirror_view(rhs); @@ -517,8 +474,7 @@ void test_newton_on_device() { Kokkos::deep_copy(x, x_h); Kokkos::RangePolicy my_policy(0, num_systems); - NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status, scale); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::fence(); @@ -536,30 +492,14 @@ void test_newton_on_device() { // No ETI is performed for these device routines // Just pick scalar types at will... -TEST_F(TestCategory, Newton_status_float) { - ::Test::test_newton_status(); -} -TEST_F(TestCategory, Newton_status_double) { - ::Test::test_newton_status(); -} +TEST_F(TestCategory, Newton_status_float) { ::Test::test_newton_status(); } +TEST_F(TestCategory, Newton_status_double) { ::Test::test_newton_status(); } -TEST_F(TestCategory, Newton_simple_float) { - ::Test::test_simple_problems(); -} -TEST_F(TestCategory, Newton_simple_double) { - ::Test::test_simple_problems(); -} +TEST_F(TestCategory, Newton_simple_float) { ::Test::test_simple_problems(); } +TEST_F(TestCategory, Newton_simple_double) { ::Test::test_simple_problems(); } -TEST_F(TestCategory, Newton_system_float) { - ::Test::test_simple_systems(); -} -TEST_F(TestCategory, Newton_system_double) { - ::Test::test_simple_systems(); -} +TEST_F(TestCategory, Newton_system_float) { ::Test::test_simple_systems(); } +TEST_F(TestCategory, Newton_system_double) { ::Test::test_simple_systems(); } -TEST_F(TestCategory, Newton_parallel_float) { - ::Test::test_newton_on_device(); -} -TEST_F(TestCategory, Newton_parallel_double) { - ::Test::test_newton_on_device(); -} +TEST_F(TestCategory, Newton_parallel_float) { ::Test::test_newton_on_device(); } +TEST_F(TestCategory, Newton_parallel_double) { ::Test::test_newton_on_device(); } diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index c7d1a84865..90bec0e184 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -37,25 +37,17 @@ struct duho { const double a11 = 0, a12 = 1, a21, a22; duho(const double m_, const double c_, const double k_) - : m(m_), - c(c_), - k(k_), - d(k_ / m_ - (c_ * c_) / (4 * m_ * m_)), - a21(-k / m), - a22(-c / m){}; + : m(m_), c(c_), k(k_), d(k_ / m_ - (c_ * c_) / (4 * m_ * m_)), a21(-k / m), a22(-c / m){}; template - KOKKOS_FUNCTION void evaluate_function(const double /*t*/, - const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { f(0) = a11 * y(0) + a12 * y(1); f(1) = a21 * y(0) + a22 * y(1); } template - KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, - const vec_type& y) const { + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, const vec_type& y) const { using KAT = Kokkos::ArithTraits; const double gamma = c / (2 * m); @@ -64,8 +56,7 @@ struct duho { const double A = y0(0) / KAT::cos(phi); y(0) = A * KAT::cos(omega * t - phi) * KAT::exp(-t * gamma); - y(1) = -y(0) * gamma - - omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma); + y(1) = -y(0) * gamma - omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma); } }; // duho @@ -76,16 +67,14 @@ struct solution_wrapper { scalar_type t; vec_type y_old, y_ref; - solution_wrapper(const ode_type& ode_, const scalar_type t_, - const vec_type& y_old_, const vec_type& y_ref_) + solution_wrapper(const ode_type& ode_, const scalar_type t_, const vec_type& y_old_, const vec_type& y_ref_) : ode(ode_), t(t_), y_old(y_old_), y_ref(y_ref_){}; KOKKOS_FUNCTION void operator()(const int /*idx*/) const { ode.solution(t, y_old, y_ref); } }; -template +template struct RKSolve_wrapper { using ode_params = KokkosODE::Experimental::ODE_params; @@ -96,10 +85,9 @@ struct RKSolve_wrapper { vec_type y_old, y_new, tmp; mv_type kstack; - RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, - const scalar_type tstart_, const scalar_type tend_, - const vec_type& y_old_, const vec_type& y_new_, - const vec_type& tmp_, const mv_type& kstack_) + RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, const scalar_type tstart_, + const scalar_type tend_, const vec_type& y_old_, const vec_type& y_new_, const vec_type& tmp_, + const mv_type& kstack_) : my_ode(my_ode_), params(params_), tstart(tstart_), @@ -111,20 +99,15 @@ struct RKSolve_wrapper { KOKKOS_FUNCTION void operator()(const int /*idx*/) const { - KokkosODE::Experimental::RungeKutta::Solve( - my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + KokkosODE::Experimental::RungeKutta::Solve(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); } }; -template -void test_method(const std::string label, ode_type& my_ode, - const scalar_type& tstart, const scalar_type& tend, - const int num_steps, vec_type& y_old, vec_type& y_new, - const int order, const int num_stages, +template +void test_method(const std::string label, ode_type& my_ode, const scalar_type& tstart, const scalar_type& tend, + const int num_steps, vec_type& y_old, vec_type& y_new, const int order, const int num_stages, const Kokkos::View& ks, - const Kokkos::View& sol, - typename vec_type::HostMirror y_ref_h) { + const Kokkos::View& sol, typename vec_type::HostMirror y_ref_h) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; @@ -133,8 +116,8 @@ void test_method(const std::string label, ode_type& my_ode, mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper - solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, y_old, + y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror_view(y_new); @@ -155,19 +138,16 @@ void test_method(const std::string label, ode_type& my_ode, EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(stageIdx, 0), 1e-8); EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(stageIdx, 1), 1e-8); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - std::cout << " k" << stageIdx << "={" << kstack_h(stageIdx, 0) << ", " - << kstack_h(stageIdx, 1) << "}" << std::endl; + std::cout << " k" << stageIdx << "={" << kstack_h(stageIdx, 0) << ", " << kstack_h(stageIdx, 1) << "}" + << std::endl; #endif } EXPECT_NEAR_KK(sol(0), y_new_h(0), 1e-8); EXPECT_NEAR_KK(sol(1), y_new_h(1), 1e-8); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << " y={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; - std::cout << " error={" - << Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)) - << ", " - << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) - << "}" << std::endl; + std::cout << " error={" << Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)) << ", " + << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) << "}" << std::endl; #else (void)y_ref_h; #endif @@ -216,8 +196,7 @@ void test_RK() { Kokkos::deep_copy(y_ref_h, y_ref); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" - << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; #endif } @@ -230,9 +209,8 @@ void test_RK() { Kokkos::View ks(ks_raw, 2, 1); double sol_raw[2] = {1, -0.04}; Kokkos::View sol(sol_raw, 2); - test_method( - "Euler-Forward", my_oscillator, tstart, tend, 1, y_old, y_new, 1, 1, ks, - sol, y_ref_h); + test_method("Euler-Forward", my_oscillator, tstart, tend, 1, y_old, + y_new, 1, 1, ks, sol, y_ref_h); } { @@ -241,9 +219,8 @@ void test_RK() { Kokkos::View ks(ks_raw, 2, 2); double sol_raw[2] = {0.9998, -0.0398}; Kokkos::View sol(sol_raw, 2); - test_method( - "Euler-Heun", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 2, ks, - sol, y_ref_h); + test_method("Euler-Heun", my_oscillator, tstart, tend, 1, y_old, + y_new, 2, 2, ks, sol, y_ref_h); } { @@ -252,73 +229,59 @@ void test_RK() { Kokkos::View ks(ks_raw, 2, 3); double sol_raw[2] = {0.9998, -0.03979999}; Kokkos::View sol(sol_raw, 2); - test_method( - "RKF-12", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 3, ks, sol, - y_ref_h); + test_method("RKF-12", my_oscillator, tstart, tend, 1, y_old, y_new, + 2, 3, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[8] = {0, -0.02, -0.02985, -0.039798, - -4, -3.98, -3.96955, -3.95940467}; + double ks_raw[8] = {0, -0.02, -0.02985, -0.039798, -4, -3.98, -3.96955, -3.95940467}; Kokkos::View ks(ks_raw, 2, 4); double sol_raw[2] = {0.99980067, -0.039798}; Kokkos::View sol(sol_raw, 2); - test_method( - "RKBS", my_oscillator, tstart, tend, 1, y_old, y_new, 3, 4, ks, sol, - y_ref_h); + test_method("RKBS", my_oscillator, tstart, tend, 1, y_old, y_new, 3, + 4, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, - -0.03979499, -0.0199505, -4, -3.99, - -3.98491562, -3.96257222, -3.95941166, -3.97984883}; + double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, -0.03979499, -0.0199505, + -4, -3.99, -3.98491562, -3.96257222, -3.95941166, -3.97984883}; Kokkos::View ks(ks_raw, 2, 6); double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method( - "RKF-45", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, sol, - y_ref_h); + test_method("RKF-45", my_oscillator, tstart, tend, 1, y_old, y_new, + 5, 6, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, - -0.03979862, -0.03484563, -4, -3.992, - -3.987946, -3.97578551, -3.95940328, -3.96454357}; + double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, -0.03979862, -0.03484563, + -4, -3.992, -3.987946, -3.97578551, -3.95940328, -3.96454357}; Kokkos::View ks(ks_raw, 2, 6); double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method( - "Cash-Karp", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, - sol, y_ref_h); + test_method("Cash-Karp", my_oscillator, tstart, tend, 1, y_old, + y_new, 5, 6, ks, sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[14] = {0, -0.008, -0.011982, -0.03187008, - -0.03539333, -0.0397954, -0.03979801, -4, - -3.992, -3.987946, -3.96762048, -3.96398013, - -3.95941068, -3.95940467}; + double ks_raw[14] = {0, -0.008, -0.011982, -0.03187008, -0.03539333, -0.0397954, -0.03979801, + -4, -3.992, -3.987946, -3.96762048, -3.96398013, -3.95941068, -3.95940467}; Kokkos::View ks(ks_raw, 2, 7); double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method( - "Dormand-Prince", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 7, - ks, sol, y_ref_h); + test_method("Dormand-Prince", my_oscillator, tstart, tend, 1, y_old, + y_new, 5, 7, ks, sol, y_ref_h); } } // test_RK -template -void test_rate(ode_type& my_ode, const scalar_type& tstart, - const scalar_type& tend, - Kokkos::View num_steps, - typename vec_type::HostMirror& y_old_h, - typename vec_type::HostMirror& y_ref_h, - typename vec_type::HostMirror& error) { +template +void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& tend, + Kokkos::View num_steps, typename vec_type::HostMirror& y_old_h, + typename vec_type::HostMirror& y_ref_h, typename vec_type::HostMirror& error) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; @@ -334,8 +297,8 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, KokkosODE::Experimental::ODE_params params(num_steps(idx)); Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper - solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, + y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(y_new_h, y_new); @@ -343,8 +306,8 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, #if defined(HAVE_KOKKOSKERNELS_DEBUG) scalar_type dt = (tend - tstart) / num_steps(idx); - std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" - << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; + std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" << y_new_h(0) << ", " << y_new_h(1) << "}" + << std::endl; #endif } @@ -399,67 +362,57 @@ void test_convergence_rate() { Kokkos::deep_copy(y_ref_h, y_ref); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" - << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; #endif } typename vec_type::HostMirror error("error", num_steps.extent(0)); - test_rate( - my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + test_rate(my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, + error); for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { double expected_ratio = - Kokkos::pow(num_steps(idx) / num_steps(idx + 1), - KokkosODE::Impl::ButcherTableau<1, 1>::order); + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), KokkosODE::Impl::ButcherTableau<1, 1>::order); double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.15); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / - Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio - << ", expected ratio: " << expected_ratio + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio << ", rel diff: " << rel_ratio_diff << std::endl; #endif } Kokkos::deep_copy(error, 0); - test_rate( - my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + test_rate(my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, + error); for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { double expected_ratio = - Kokkos::pow(num_steps(idx) / num_steps(idx + 1), - KokkosODE::Impl::ButcherTableau<2, 3>::order); + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), KokkosODE::Impl::ButcherTableau<2, 3>::order); double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / - Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio - << ", expected ratio: " << expected_ratio + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio << ", rel diff: " << rel_ratio_diff << std::endl; #endif } Kokkos::deep_copy(error, 0); - test_rate( - my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + test_rate(my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, + error); for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { double expected_ratio = - Kokkos::pow(num_steps(idx) / num_steps(idx + 1), - KokkosODE::Impl::ButcherTableau<4, 5>::order); + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), KokkosODE::Impl::ButcherTableau<4, 5>::order); double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / - Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio - << ", expected ratio: " << expected_ratio + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio << ", rel diff: " << rel_ratio_diff << std::endl; #endif } @@ -507,24 +460,19 @@ void test_adaptivity() { Kokkos::deep_copy(y_ref_h, y_ref); #if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" - << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; #endif } vec_type tmp("tmp vector", neqs); - mv_type kstack( - "k stack", - KokkosODE::Experimental::RungeKutta::num_stages(), neqs); + mv_type kstack("k stack", KokkosODE::Experimental::RungeKutta::num_stages(), neqs); Kokkos::RangePolicy my_policy(0, 1); - KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, - minStepSize); + KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, minStepSize); Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper - solve_wrapper(my_oscillator, params, tstart, tend, y_old, y_new, tmp, - kstack); + RKSolve_wrapper solve_wrapper(my_oscillator, params, tstart, tend, + y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -547,8 +495,7 @@ void test_adaptivity() { for (int idx = 0; idx < y_new_h.extent_int(0); ++idx) { #if defined(HAVE_KOKKOSKERNELS_DEBUG) - error = - Kokkos::abs(y_new_h(idx) - y_ref_h(idx)) / Kokkos::abs(y_ref_h(idx)); + error = Kokkos::abs(y_new_h(idx) - y_ref_h(idx)) / Kokkos::abs(y_ref_h(idx)); std::cout << error << " "; #endif EXPECT_NEAR_KK_REL(y_new_h(idx), y_ref_h(idx), 1e-7); diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 763f38a013..690e271c84 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -33,13 +33,11 @@ struct chem_model_1 { const double tstart, tend, T0, T1; - chem_model_1(const double tstart_ = 0, const double tend_ = 100, - const double T0_ = 300, const double T1_ = 800) + chem_model_1(const double tstart_ = 0, const double tend_ = 100, const double T0_ = 300, const double T1_ = 800) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { // First compute the temperature // using linear ramp from T0 to T1 @@ -61,13 +59,11 @@ struct chem_model_2 { const double tstart, tend, T0, T1; - chem_model_2(const double tstart_ = 0, const double tend_ = 1200, - const double T0_ = 300, const double T1_ = 1000) + chem_model_2(const double tstart_ = 0, const double tend_ = 1200, const double T0_ = 300, const double T1_ = 1000) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, - const vec_type1& y, + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { // First compute the temperature // using linear ramp from T0 to T1 @@ -116,9 +112,8 @@ void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper - solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -126,15 +121,11 @@ void test_chem() { #if defined(HAVE_KOKKOSKERNELS_DEBUG) const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; std::cout << "\nChem model 1" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend - << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 - << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" - << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" - << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; #endif } @@ -162,9 +153,8 @@ void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper - solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -172,17 +162,13 @@ void test_chem() { #if defined(HAVE_KOKKOSKERNELS_DEBUG) const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; std::cout << "\nChem model 2" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend - << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 - << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " - << y_old_h(2) << ", " << y_old_h(3) << ", " << y_old_h(4) << ", " - << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " - << y_new_h(2) << ", " << y_new_h(3) << ", " << y_new_h(4) << ", " - << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " << y_old_h(2) << ", " << y_old_h(3) << ", " + << y_old_h(4) << ", " << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " << y_new_h(2) << ", " << y_new_h(3) << ", " + << y_new_h(4) << ", " << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl; #endif } } // test_chem diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index adfc336576..e4a2416010 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -61,8 +61,7 @@ inline void add_kokkos_configuration(bool verbose) { auto val = remove_unwanted_characters(line.substr(found + 1)); // Ignore line without value, for example a category name if (!val.empty()) { - benchmark::AddCustomContext( - remove_unwanted_characters(line.substr(0, found)), val); + benchmark::AddCustomContext(remove_unwanted_characters(line.substr(0, found)), val); } } } @@ -75,18 +74,13 @@ inline void add_version_info() { if (!GIT_BRANCH.empty()) { benchmark::AddCustomContext("GIT_BRANCH", std::string(GIT_BRANCH)); - benchmark::AddCustomContext("GIT_COMMIT_HASH", - std::string(GIT_COMMIT_HASH)); - benchmark::AddCustomContext("GIT_CLEAN_STATUS", - std::string(GIT_CLEAN_STATUS)); - benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", - std::string(GIT_COMMIT_DESCRIPTION)); - benchmark::AddCustomContext("GIT_COMMIT_DATE", - std::string(GIT_COMMIT_DATE)); + benchmark::AddCustomContext("GIT_COMMIT_HASH", std::string(GIT_COMMIT_HASH)); + benchmark::AddCustomContext("GIT_CLEAN_STATUS", std::string(GIT_CLEAN_STATUS)); + benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", std::string(GIT_COMMIT_DESCRIPTION)); + benchmark::AddCustomContext("GIT_COMMIT_DATE", std::string(GIT_COMMIT_DATE)); } if (!BENCHMARK_VERSION.empty()) { - benchmark::AddCustomContext("GOOGLE_BENCHMARK_VERSION", - std::string(BENCHMARK_VERSION)); + benchmark::AddCustomContext("GOOGLE_BENCHMARK_VERSION", std::string(BENCHMARK_VERSION)); } } @@ -117,20 +111,16 @@ inline void add_benchmark_context(bool verbose = false) { } template -inline auto register_benchmark(const char* name, FuncType func, - std::vector arg_names, - std::vector args, int repeat, - ArgsToCallOp&&... func_args) { +inline auto register_benchmark(const char* name, FuncType func, std::vector arg_names, + std::vector args, int repeat, ArgsToCallOp&&... func_args) { if (repeat > 0) { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) + return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime() ->Iterations(repeat); } else { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) + return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime(); @@ -138,20 +128,16 @@ inline auto register_benchmark(const char* name, FuncType func, } template -inline auto register_benchmark_real_time(const char* name, FuncType func, - std::vector arg_names, - std::vector args, int repeat, - ArgsToCallOp&&... func_args) { +inline auto register_benchmark_real_time(const char* name, FuncType func, std::vector arg_names, + std::vector args, int repeat, ArgsToCallOp&&... func_args) { if (repeat > 0) { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) + return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseRealTime() ->Iterations(repeat); } else { - return benchmark::RegisterBenchmark( - name, func, std::forward(func_args)...) + return benchmark::RegisterBenchmark(name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseRealTime(); diff --git a/perf_test/KokkosKernels_perf_test_instantiation.hpp b/perf_test/KokkosKernels_perf_test_instantiation.hpp index 6844922ddb..8a46754030 100644 --- a/perf_test/KokkosKernels_perf_test_instantiation.hpp +++ b/perf_test/KokkosKernels_perf_test_instantiation.hpp @@ -57,9 +57,7 @@ int main_instantiation(int argc, char** argv) { else if (params.use_sycl) device_id = params.use_sycl - 1; - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); + Kokkos::initialize(Kokkos::InitializationSettings().set_num_threads(num_threads).set_device_id(device_id)); Kokkos::print_configuration(std::cout); std::cout << '\n'; @@ -112,8 +110,7 @@ int main_instantiation(int argc, char** argv) { if (params.use_sycl) { #if defined(KOKKOS_ENABLE_SYCL) std::cout << "Running on SYCL backend.\n"; - KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, - params); + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); ran = true; #else std::cout << "ERROR: SYCL requested, but not available.\n"; diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index 1303b2370e..ec767c68f7 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -39,50 +39,49 @@ struct CommonInputParams { std::string list_common_options() { std::ostringstream common_options; - common_options - << "\t[Required] Backend: the available backends are:\n" + common_options << "\t[Required] Backend: the available backends are:\n" #ifdef KOKKOS_ENABLE_THREADS - << "\t\t'--threads [numThreads]'\n" + << "\t\t'--threads [numThreads]'\n" #endif #ifdef KOKKOS_ENABLE_OPENMP - << "\t\t'--openmp [numThreads]'\n" + << "\t\t'--openmp [numThreads]'\n" #endif #ifdef KOKKOS_ENABLE_CUDA - << "\t\t'--cuda [deviceIndex]'\n" + << "\t\t'--cuda [deviceIndex]'\n" #endif #ifdef KOKKOS_ENABLE_HIP - << "\t\t'--hip [deviceIndex]'\n" + << "\t\t'--hip [deviceIndex]'\n" #endif #ifdef KOKKOS_ENABLE_SYCL - << "\t\t'--sycl [deviceIndex]'\n" + << "\t\t'--sycl [deviceIndex]'\n" #endif #ifdef KOKKOS_ENABLE_SERIAL - << "\t\tIf no parallel backend is requested, Serial will be used.\n" + << "\t\tIf no parallel backend is requested, Serial will be used.\n" #endif - << "\n" - << "\t The following backends are not available because Kokkos was not " - "configured with them:\n" + << "\n" + << "\t The following backends are not available because Kokkos was not " + "configured with them:\n" #ifndef KOKKOS_ENABLE_THREADS - << "\t\t'--threads [numThreads]'\n" + << "\t\t'--threads [numThreads]'\n" #endif #ifndef KOKKOS_ENABLE_OPENMP - << "\t\t'--openmp [numThreads]'\n" + << "\t\t'--openmp [numThreads]'\n" #endif #ifndef KOKKOS_ENABLE_CUDA - << "\t\t'--cuda [deviceIndex]'\n" + << "\t\t'--cuda [deviceIndex]'\n" #endif #ifndef KOKKOS_ENABLE_HIP - << "\t\t'--hip [deviceIndex]'\n" + << "\t\t'--hip [deviceIndex]'\n" #endif #ifndef KOKKOS_ENABLE_SYCL - << "\t\t'--sycl [deviceIndex]'\n" + << "\t\t'--sycl [deviceIndex]'\n" #endif #ifndef KOKKOS_ENABLE_SERIAL - << "\t\tSerial is not enabled so a parallel backend must be selected.\n" + << "\t\tSerial is not enabled so a parallel backend must be selected.\n" #endif - << "\n" - << "\t[Optional]:\n" - << "\t\t'-h', '--help': show available options\n\n"; + << "\n" + << "\t[Optional]:\n" + << "\t\t'-h', '--help': show available options\n\n"; return common_options.str(); } @@ -94,15 +93,13 @@ void process_arg_int(char const* str_val, int& val) { if (str_val == ptr_end) { std::stringstream ss; - ss << "Error: cannot convert command line argument '" << str_val - << "' to an integer.\n"; + ss << "Error: cannot convert command line argument '" << str_val << "' to an integer.\n"; throw std::invalid_argument(ss.str()); } if (errno == ERANGE) { std::stringstream ss; - ss << "Error: converted value for command line argument '" << str_val - << "' falls out of range.\n"; + ss << "Error: converted value for command line argument '" << str_val << "' falls out of range.\n"; throw std::invalid_argument(ss.str()); } } @@ -114,21 +111,18 @@ void process_arg_double(char const* str_val, double& val) { if (str_val == ptr_end) { std::stringstream ss; - ss << "Error: cannot convert command line argument '" << str_val - << "' to a double.\n"; + ss << "Error: cannot convert command line argument '" << str_val << "' to a double.\n"; throw std::invalid_argument(ss.str()); } if (errno == ERANGE) { std::stringstream ss; - ss << "Error: converted value for command line argument '" << str_val - << "' falls out of range.\n"; + ss << "Error: converted value for command line argument '" << str_val << "' falls out of range.\n"; throw std::invalid_argument(ss.str()); } } -bool check_arg_int(int const i, int const argc, char** argv, char const* name, - int& val) { +bool check_arg_int(int const i, int const argc, char** argv, char const* name, int& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -143,8 +137,7 @@ bool check_arg_int(int const i, int const argc, char** argv, char const* name, return true; } -bool check_arg_double(int const i, int const argc, char** argv, - char const* name, double& val) { +bool check_arg_double(int const i, int const argc, char** argv, char const* name, double& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -159,8 +152,7 @@ bool check_arg_double(int const i, int const argc, char** argv, return true; } -bool check_arg_bool(int const i, int const /*argc*/, char** argv, - char const* name, bool& val) { +bool check_arg_bool(int const i, int const /*argc*/, char** argv, char const* name, bool& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -168,8 +160,7 @@ bool check_arg_bool(int const i, int const /*argc*/, char** argv, return true; } -bool check_arg_str(int const i, int const argc, char** argv, char const* name, - std::string& val) { +bool check_arg_str(int const i, int const argc, char** argv, char const* name, std::string& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { return false; } @@ -198,8 +189,7 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { int remove_flags = 0; if (check_arg_int(argIdx, argc, argv, "--threads", params.use_threads)) { remove_flags = 2; - } else if (check_arg_int(argIdx, argc, argv, "--openmp", - params.use_openmp)) { + } else if (check_arg_int(argIdx, argc, argv, "--openmp", params.use_openmp)) { remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--cuda", params.use_cuda)) { params.use_cuda++; @@ -213,8 +203,7 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { } else if (check_arg_int(argIdx, argc, argv, "--repeat", params.repeat)) { remove_flags = 2; } else if (check_arg_bool(argIdx, argc, argv, "-h", params.print_help) || - check_arg_bool(argIdx, argc, argv, "--help", - params.print_help)) { + check_arg_bool(argIdx, argc, argv, "--help", params.print_help)) { remove_flags = 1; } diff --git a/perf_test/PerfTestUtilities.cpp b/perf_test/PerfTestUtilities.cpp index c403d0213d..479d50d2ba 100644 --- a/perf_test/PerfTestUtilities.cpp +++ b/perf_test/PerfTestUtilities.cpp @@ -23,8 +23,6 @@ namespace test { std::string inputDataPath; -void set_input_data_path(const std::string& path_to_data) { - inputDataPath = path_to_data; -} +void set_input_data_path(const std::string& path_to_data) { inputDataPath = path_to_data; } std::string get_input_data_path() { return inputDataPath; } } // namespace test diff --git a/perf_test/PerfTestUtilities.hpp b/perf_test/PerfTestUtilities.hpp index 4de10312b6..f6531a76fb 100644 --- a/perf_test/PerfTestUtilities.hpp +++ b/perf_test/PerfTestUtilities.hpp @@ -36,8 +36,7 @@ std::string get_input_data_path(); namespace KokkosSparse { -template +template class CrsMatrix; } @@ -62,8 +61,7 @@ inline std::vector get_directories(std::string path) { std::string nname = std::string(dir->d_name); // Check to see if item is a directory // if (isDirectory(path + '/' + nname)) - if (nname != "." && nname != ".." && - isDirectory(path + '/' + dir->d_name)) + if (nname != "." && nname != ".." && isDirectory(path + '/' + dir->d_name)) // std::vector::emplace_back: insert a new element to the end of vector paths.emplace_back(dir->d_name); } @@ -75,18 +73,16 @@ inline std::vector get_directories(std::string path) { namespace readers { template -using matrix_type = - KokkosSparse::CrsMatrix; +using matrix_type = KokkosSparse::CrsMatrix; template struct test_reader; template struct test_reader> { - static matrix_type read( - const std::string &filename) { - return KokkosKernels::Impl::read_kokkos_crst_matrix< - matrix_type>(filename.c_str()); + static matrix_type read(const std::string &filename) { + return KokkosKernels::Impl::read_kokkos_crst_matrix>( + filename.c_str()); } }; @@ -100,30 +96,23 @@ struct data_retriever { std::tuple test_data; }; std::vector test_cases; - std::string make_full_path_to_data_file(std::string repo, - std::string path_to_data, - std::string dataset, + std::string make_full_path_to_data_file(std::string repo, std::string path_to_data, std::string dataset, std::string filename) { - return root_path + "/" + repo + "/" + path_to_data + dataset + "/" + - filename; + return root_path + "/" + repo + "/" + path_to_data + dataset + "/" + filename; } template - data_retriever(std::string path_to_data, Locations... locations) - : sub_path(path_to_data) { + data_retriever(std::string path_to_data, Locations... locations) : sub_path(path_to_data) { root_path = test::get_input_data_path(); // TODO: way to list the directories in the root path std::vector data_repos = get_directories(root_path + "/"); // TODO: list directories in subpaths for (auto repo : data_repos) { - std::vector datasets = - get_directories(root_path + "/" + repo + "/" + path_to_data + "/"); + std::vector datasets = get_directories(root_path + "/" + repo + "/" + path_to_data + "/"); for (auto dataset : datasets) { - test_cases.push_back( - test_case{repo + "/" + dataset, - std::make_tuple(readers::test_reader::read( - make_full_path_to_data_file( - repo, path_to_data, dataset, locations))...)}); + test_cases.push_back(test_case{repo + "/" + dataset, + std::make_tuple(readers::test_reader::read( + make_full_path_to_data_file(repo, path_to_data, dataset, locations))...)}); } } } diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index 53a6f8f173..5081017e46 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -50,8 +50,8 @@ using member_type = typename policy_type::member_type; using namespace KokkosBatched; template -val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, - const ManyVectorType &b, const ManyVectorType &r) { +val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, const ManyVectorType &b, + const ManyVectorType &r) { /// compute residual val_type residual(0); { @@ -66,17 +66,12 @@ val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, auto xx = Kokkos::subview(x, i, Kokkos::ALL()); auto rr = Kokkos::subview(r, i, Kokkos::ALL()); - TeamGemv::invoke(member, -one, AA, xx, one, - rr); + TeamGemv::invoke(member, -one, AA, xx, one, rr); val_type sum(0); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member, rr.extent(0)), - [&](const int &k, val_type &lsum) { - lsum += Kokkos::ArithTraits::abs(rr(k)); - }, - sum); + [&](const int &k, val_type &lsum) { lsum += Kokkos::ArithTraits::abs(rr(k)); }, sum); Kokkos::single(Kokkos::PerTeam(member), [&]() { update += sum; }); }, residual); @@ -132,8 +127,8 @@ struct Task1SolveLowerTriangular { const val_type one(1); auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, one, TT, AA); + TeamTrsm::invoke( + member, one, TT, AA); } }; @@ -152,9 +147,8 @@ struct Task1SolveUpperTriangular { const val_type one(1); auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, one, TT, - AA); + TeamTrsm::invoke( + member, one, TT, AA); } }; } // namespace ConstructBlockJacobi @@ -176,8 +170,7 @@ struct Task1ApplyBlockJacobi { auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); - TeamGemv::invoke( - member, one, AA, bb, zero, xx); + TeamGemv::invoke(member, one, AA, bb, zero, xx); } }; @@ -200,11 +193,10 @@ struct Task2FactorizeInvert { TeamLU::invoke(member, AA); TeamCopy::invoke(member, AA, TT); TeamSetIdentity::invoke(member, AA); - TeamTrsm::invoke(member, one, TT, AA); - TeamTrsm::invoke(member, one, TT, - AA); + TeamTrsm::invoke( + member, one, TT, AA); + TeamTrsm::invoke( + member, one, TT, AA); } }; @@ -225,8 +217,7 @@ struct Task2ApplyBlockJacobi { auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); - TeamGemv::invoke( - member, one, AA, bb, zero, xx); + TeamGemv::invoke(member, one, AA, bb, zero, xx); } }; @@ -260,22 +251,17 @@ int main(int argc, char *argv[]) { /// x - solution vector /// b - right hand side vector /// - Kokkos::View A( - "block diagonals", N, Blk, Blk); - Kokkos::View T( - "temporal block diagonals", N, Blk, Blk); - Kokkos::View x("x", N, - Blk); - Kokkos::View b("b", N, - Blk); + Kokkos::View A("block diagonals", N, Blk, Blk); + Kokkos::View T("temporal block diagonals", N, Blk, Blk); + Kokkos::View x("x", N, Blk); + Kokkos::View b("b", N, Blk); /// copy of A to check residual - Kokkos::View Acopy( - "Acopy", A.extent(0), A.extent(1), A.extent(2)); + Kokkos::View Acopy("Acopy", A.extent(0), A.extent(1), + A.extent(2)); /// residual vector - Kokkos::View r( - "r", b.extent(0), b.extent(1)); + Kokkos::View r("r", b.extent(0), b.extent(1)); /// The block diagonal matrices are assumed to be extracted from a block /// sparse matrix. Here we set the blocks with random values @@ -308,23 +294,15 @@ int main(int argc, char *argv[]) { { policy_type policy(A.extent(0), Kokkos::AUTO()); timer.reset(); - Kokkos::parallel_for( - "task1.factorize", policy, - ConstructBlockJacobi::Task1Factorize(A)); + Kokkos::parallel_for("task1.factorize", policy, ConstructBlockJacobi::Task1Factorize(A)); Kokkos::deep_copy(T, A); - Kokkos::parallel_for( - "task1.set-identity", policy, - ConstructBlockJacobi::Task1SetIdentity(A)); + Kokkos::parallel_for("task1.set-identity", policy, ConstructBlockJacobi::Task1SetIdentity(A)); Kokkos::fence(); - Kokkos::parallel_for( - "task1.solve-lower-triangular", policy, - ConstructBlockJacobi::Task1SolveLowerTriangular(A, T)); + Kokkos::parallel_for("task1.solve-lower-triangular", policy, + ConstructBlockJacobi::Task1SolveLowerTriangular(A, T)); Kokkos::fence(); - Kokkos::parallel_for( - "task1.solve-upper-triangular", policy, - ConstructBlockJacobi::Task1SolveUpperTriangular(A, T)); + Kokkos::parallel_for("task1.solve-upper-triangular", policy, + ConstructBlockJacobi::Task1SolveUpperTriangular(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -337,10 +315,8 @@ int main(int argc, char *argv[]) { { timer.reset(); policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for( - "task1.apply-block-jacobi", policy, - Task1ApplyBlockJacobi(A, x, - b)); + Kokkos::parallel_for("task1.apply-block-jacobi", policy, + Task1ApplyBlockJacobi(A, x, b)); const double t = timer.seconds(); printf( "task 1: application of jacobi time = %f , # of applications per " @@ -374,9 +350,7 @@ int main(int argc, char *argv[]) { { policy_type policy(A.extent(0), Kokkos::AUTO()); timer.reset(); - Kokkos::parallel_for( - "task2.factorize-invert", policy, - Task2FactorizeInvert(A, T)); + Kokkos::parallel_for("task2.factorize-invert", policy, Task2FactorizeInvert(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -389,10 +363,8 @@ int main(int argc, char *argv[]) { { timer.reset(); policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for( - "task2.apply-block-jacobi", policy, - Task2ApplyBlockJacobi(A, x, - b)); + Kokkos::parallel_for("task2.apply-block-jacobi", policy, + Task2ApplyBlockJacobi(A, x, b)); const double t = timer.seconds(); printf( "task 2: application of jacobi time = %f , # of applications per " diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index f3eb0dd8ac..810112baa3 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -66,11 +66,9 @@ using member_type = typename policy_type::member_type; /// using namespace KokkosBatched; -static constexpr int vector_length = - DefaultVectorLength::value; +static constexpr int vector_length = DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = - DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif @@ -169,15 +167,11 @@ struct SetTridiagToIdentity { KOKKOS_INLINE_FUNCTION void operator()(const member_type &member) const { const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, __AA.extent(5)), - [&](const int &v) { - for (int k = 0, kend = __AA.extent(3); k < kend; ++k) - __AA(i, j, 1, k, k, v) = 1; - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { + for (int k = 0, kend = __AA.extent(3); k < kend; ++k) __AA(i, j, 1, k, k, v) = 1; + }); + }); } }; @@ -192,46 +186,42 @@ struct Factorize { KOKKOS_INLINE_FUNCTION void operator()(const member_type &member) const { - typedef FactorizeModeAndAlgo - default_mode_and_algo_type; + typedef FactorizeModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { - auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (__L == 1) { - A.assign_data(&AAA(0, 1, 0, 0)); - LU::invoke(member, A); - } else { - for (int k = 0; k < (__L - 1); ++k) { - A.assign_data(&AAA(k, 1, 0, 0)); - B.assign_data(&AAA(k, 2, 0, 0)); - C.assign_data(&AAA(k, 0, 0, 0)); - D.assign_data(&AAA(k + 1, 1, 0, 0)); - - LU::invoke(member, A); - Trsm::invoke(member, 1.0, A, B); - Trsm::invoke(member, 1.0, A, - C); - Gemm::invoke(member, -1.0, C, B, 1.0, D); - } - LU::invoke(member, D); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { + auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (__L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (__L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke( + member, 1.0, A, B); + Trsm::invoke( + member, 1.0, A, C); + Gemm::invoke(member, -1.0, C, B, + 1.0, D); + } + LU::invoke(member, D); + } + }); } }; @@ -275,58 +265,46 @@ int main(int argc, char *argv[]) { /// /// double 16 - Kokkos::View Av( - "A", N / vector_length, L, 3, Blk, Blk); + Kokkos::View Av("A", N / vector_length, L, 3, Blk, Blk); /// double Kokkos::View As( - (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), - Av.extent(3), Av.extent(4), vector_length); + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View - Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), - Av.extent(2), Av.extent(3), Av.extent(4), - vector_length / internal_vector_length); + Kokkos::View Ai( + (internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv( - "x", N / vector_length, Nvec, L, Blk); + Kokkos::View xv("x", N / vector_length, Nvec, L, Blk); /// double Kokkos::View xs( - (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), - xv.extent(3), vector_length); + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length); /// double 2 - Kokkos::View - xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), - xv.extent(2), xv.extent(3), vector_length / internal_vector_length); + Kokkos::View xi( + (internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), + vector_length / internal_vector_length); /// double 16 - Kokkos::View bv( - "b", N / vector_length, Nvec, L, Blk); + Kokkos::View bv("b", N / vector_length, Nvec, L, Blk); /// double Kokkos::View bs( - (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), - bv.extent(3), vector_length); + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length); /// double 2 - Kokkos::View - bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), - bv.extent(2), bv.extent(3), vector_length / internal_vector_length); + Kokkos::View bi( + (internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), + vector_length / internal_vector_length); /// double copy of A Kokkos::View Acopy( - "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), - As.extent(4), As.extent(5)); + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), As.extent(4), As.extent(5)); - Kokkos::View rs( - "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), - bs.extent(4)); + Kokkos::View rs("rs", bs.extent(0), bs.extent(1), + bs.extent(2), bs.extent(3), bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -347,8 +325,7 @@ int main(int argc, char *argv[]) { #endif timer.reset(); policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for("setTridiagToIdentity", policy, - SetTridiagToIdentity(AA)); + Kokkos::parallel_for("setTridiagToIdentity", policy, SetTridiagToIdentity(AA)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -385,16 +362,14 @@ int main(int argc, char *argv[]) { } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for("factorize", - policy.set_scratch_size(0, Kokkos::PerTeam(S)), + Kokkos::parallel_for("factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), Factorize(AA, L)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("factorize time = %f , # of factorization per min = %f \n", t, - 1.0 / t * 60); + printf("factorize time = %f , # of factorization per min = %f \n", t, 1.0 / t * 60); } /// @@ -417,121 +392,96 @@ int main(int argc, char *argv[]) { policy_type policy(AA.extent(0), team_size, AA.extent(5)); for (int iter = 0; iter < niter; ++iter) { Kokkos::parallel_for( - "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo - default_mode_and_algo_type; + "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec = 0; jvec < Nvec; ++jvec) { - auto x = Kokkos::subview(xx, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto b = Kokkos::subview(bb, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - - auto xt = Kokkos::subview(x, 0, Kokkos::ALL()); - auto xb = Kokkos::subview(x, 0, Kokkos::ALL()); - - /// - /// forward substitution - /// - { - // const bool is_same_x_and_b = (x.data() == b.data()); - auto LT = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - auto LB = - Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()); - - auto bk = Kokkos::subview(b, 0, Kokkos::ALL()); - { - { // if (!is_same_x_and_b) { - Copy::invoke(member, bk, xb); - member.team_barrier(); - } - } - const int kend = L - 1; - for (int k = 0; k < kend; ++k) { - LT.assign_data(&A(k, 0, 0)); - LB.assign_data(&C(k, 0, 0)); - - xt.assign_data(&x(k, 0)); - xb.assign_data(&x(k + 1, 0)); - - { // if (!is_same_x_and_b) { - bk.assign_data(&b(k + 1, 0)); - Copy::invoke(member, bk, xb); - } - - Trsv::invoke(member, - 1.0, - LT, - xt); - - Gemv::invoke(member, -1.0, LB, xt, 1.0, - xb); - } - { - LT.assign_data(&A(kend, 0, 0)); - xt.assign_data(&x(kend, 0)); - Trsv::invoke(member, - 1.0, - LT, - xt); - } - } /// end forward substitution - - /// - /// backward substitution - /// - { - auto UT = - Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); - auto UB = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - - const int kbegin = L - 1; - for (int k = kbegin; k > 0; --k) { - UT.assign_data(&B(k - 1, 0, 0)); - UB.assign_data(&A(k, 0, 0)); - - xt.assign_data(&x(k - 1, 0)); - xb.assign_data(&x(k, 0)); - - Trsv::invoke(member, 1.0, UB, xb); - - Gemv::invoke(member, -1.0, UT, xb, 1.0, - xt); - } - { - UB.assign_data(&A(0, 0, 0)); - xb.assign_data(&x(0, 0)); - Trsv::invoke(member, 1.0, UB, xb); - } - } // end backward substitution + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x = Kokkos::subview(xx, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bb, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + + auto xt = Kokkos::subview(x, 0, Kokkos::ALL()); + auto xb = Kokkos::subview(x, 0, Kokkos::ALL()); + + /// + /// forward substitution + /// + { + // const bool is_same_x_and_b = (x.data() == b.data()); + auto LT = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto LB = Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()); + + auto bk = Kokkos::subview(b, 0, Kokkos::ALL()); + { + { // if (!is_same_x_and_b) { + Copy::invoke(member, bk, xb); + member.team_barrier(); + } + } + const int kend = L - 1; + for (int k = 0; k < kend; ++k) { + LT.assign_data(&A(k, 0, 0)); + LB.assign_data(&C(k, 0, 0)); + + xt.assign_data(&x(k, 0)); + xb.assign_data(&x(k + 1, 0)); + + { // if (!is_same_x_and_b) { + bk.assign_data(&b(k + 1, 0)); + Copy::invoke(member, bk, xb); + } + + Trsv::invoke( + member, 1.0, LT, xt); + + Gemv::invoke(member, -1.0, LB, xt, 1.0, + xb); + } + { + LT.assign_data(&A(kend, 0, 0)); + xt.assign_data(&x(kend, 0)); + Trsv::invoke( + member, 1.0, LT, xt); + } + } /// end forward substitution + + /// + /// backward substitution + /// + { + auto UT = Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); + auto UB = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + + const int kbegin = L - 1; + for (int k = kbegin; k > 0; --k) { + UT.assign_data(&B(k - 1, 0, 0)); + UB.assign_data(&A(k, 0, 0)); + + xt.assign_data(&x(k - 1, 0)); + xb.assign_data(&x(k, 0)); + + Trsv::invoke( + member, 1.0, UB, xb); + + Gemv::invoke(member, -1.0, UT, xb, 1.0, + xt); } - }); + { + UB.assign_data(&A(0, 0, 0)); + xb.assign_data(&x(0, 0)); + Trsv::invoke( + member, 1.0, UB, xb); + } + } // end backward substitution + } + }); }); Kokkos::fence(); } @@ -539,8 +489,7 @@ int main(int argc, char *argv[]) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, - 1.0 / t * 60 * niter); + printf("solve time = %f , # of solves per min = %f\n", t, 1.0 / t * 60 * niter); } /// @@ -552,114 +501,77 @@ int main(int argc, char *argv[]) { Kokkos::parallel_for( "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, Acopy.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; - ++jvec) { - auto x = Kokkos::subview(xs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - - if (L == 1) { - auto A0 = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); - auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); - auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); - - TeamCopy::invoke(member, - b0, r0); - TeamGemv::invoke(member, -1.0, A0, x0, 1.0, - r0); - } else { - int k = 0; - { - /// first row - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke( - member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, -1.0, A1, x1, 1.0, - rk); - TeamGemv::invoke(member, -1.0, B2, x2, 1.0, - rk); - ++k; - } - for (; k < (L - 1); ++k) { - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke( - member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, -1.0, C0, x0, 1.0, - rk); - TeamGemv::invoke(member, -1.0, A1, x1, 1.0, - rk); - TeamGemv::invoke(member, -1.0, B2, x2, 1.0, - rk); - } - { - // last row - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke( - member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, -1.0, C0, x0, 1.0, - rk); - TeamGemv::invoke(member, -1.0, A1, x1, 1.0, - rk); - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, b0, r0); + TeamGemv::invoke(member, -1.0, A0, x0, 1.0, r0); + } else { + int k = 0; + { + /// first row + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, rk); + ++k; } - }); + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, rk); + } + } + } + }); }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); @@ -669,13 +581,11 @@ int main(int argc, char *argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; - ++i0) // N/vector_length - for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec - for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L - for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk - for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; - ++i4) { // vector_length + for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; ++i0) // N/vector_length + for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec + for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L + for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk + for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; ++i4) { // vector_length const auto val = bs_host(i0, i1, i2, i3, i4); const auto res = rs_host(i0, i1, i2, i3, i4); norm2 += val * val; diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 67a141578e..629c73924e 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -72,11 +72,9 @@ typedef double value_type; /// using namespace KokkosBatched; -static constexpr int vector_length = - DefaultVectorLength::value; +static constexpr int vector_length = DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = - DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif @@ -98,20 +96,17 @@ struct InverseDiagonalsModeAndAlgoHostImpl { #if defined(KOKKOS_ENABLE_SERIAL) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoHostImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoHostImpl {}; #endif #if defined(KOKKOS_ENABLE_THREADS) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoHostImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoHostImpl {}; #endif #if defined(KOKKOS_ENABLE_ONPENMP) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoHostImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoHostImpl {}; #endif struct InverseDiagonalsModeAndAlgoDeviceImpl { @@ -121,14 +116,12 @@ struct InverseDiagonalsModeAndAlgoDeviceImpl { #if defined(KOKKOS_ENABLE_CUDA) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoDeviceImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif #if defined(KOKKOS_ENABLE_HIP) template <> -struct InverseDiagonalsModeAndAlgo - : InverseDiagonalsModeAndAlgoDeviceImpl {}; +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif template @@ -211,56 +204,46 @@ int main(int argc, char *argv[]) { /// /// double 16 - Kokkos::View Av( - "A", N / vector_length, L, 4, Blk, Blk); + Kokkos::View Av("A", N / vector_length, L, 4, Blk, Blk); /// double Kokkos::View As( - (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), - Av.extent(3), Av.extent(4), vector_length); + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View - Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), - Av.extent(2), Av.extent(3), Av.extent(4), - vector_length / internal_vector_length); + Kokkos::View Ai( + (internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv( - "x", N / vector_length, Nvec, 2, L, Blk); + Kokkos::View xv("x", N / vector_length, Nvec, 2, L, Blk); /// double Kokkos::View xs( - (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), - xv.extent(3), xv.extent(4), vector_length); + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), xv.extent(4), vector_length); /// double 2 - Kokkos::View - xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), - xv.extent(2), xv.extent(3), xv.extent(4), - vector_length / internal_vector_length); + Kokkos::View xi( + (internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), xv.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View bv( - "b", N / vector_length, Nvec, L, Blk); + Kokkos::View bv("b", N / vector_length, Nvec, L, Blk); /// double Kokkos::View bs( - (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), - bv.extent(3), vector_length); + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length); /// double 2 - Kokkos::View - bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), - bv.extent(2), bv.extent(3), vector_length / internal_vector_length); + Kokkos::View bi( + (internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), + vector_length / internal_vector_length); /// double copy of A Kokkos::View Acopy( - "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), - As.extent(4), As.extent(5)); + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), As.extent(4), As.extent(5)); - Kokkos::View rs( - "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), - bs.extent(4)); + Kokkos::View rs("rs", bs.extent(0), bs.extent(1), bs.extent(2), + bs.extent(3), bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -288,18 +271,13 @@ int main(int argc, char *argv[]) { using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0) * L, Kokkos::AUTO(), AA.extent(5)); Kokkos::parallel_for( - "diagonal dominant", policy, - KOKKOS_LAMBDA(const member_type &member) { + "diagonal dominant", policy, KOKKOS_LAMBDA(const member_type &member) { const int i = member.league_rank() / L; const int k = member.league_rank() % L; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, Blk), [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - AA(i, k, 1, j, j, v) += internal_vector_type(9 * Blk); - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, Blk), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { AA(i, k, 1, j, j, v) += internal_vector_type(9 * Blk); }); + }); }); Kokkos::fence(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -318,16 +296,14 @@ int main(int argc, char *argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - const int per_team_scratch = - scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); - int team_size = 0; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + const int per_team_scratch = scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -338,49 +314,37 @@ int main(int argc, char *argv[]) { policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); Kokkos::parallel_for( - "inverse diagonals", - policy.set_scratch_size( - 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + "inverse diagonals", policy.set_scratch_size(0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo - default_mode_and_algo_type; + typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; const int i = member.league_rank() / L; const int k = member.league_rank() % L; - scratch_view_type WW(member.team_scratch(0), Blk, Blk, - AA.extent(5)); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), - Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), - Kokkos::ALL(), v); - auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); - - Copy::invoke( - member, A, W); - SetIdentity::invoke(member, D); - member.team_barrier(); - LU::invoke(member, W); - Trsm::invoke(member, 1.0, W, - D); - Trsm::invoke(member, 1.0, - W, D); - }); + scratch_view_type WW(member.team_scratch(0), Blk, Blk, AA.extent(5)); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + + Copy::invoke(member, A, W); + SetIdentity::invoke(member, D); + member.team_barrier(); + LU::invoke(member, W); + Trsm::invoke( + member, 1.0, W, D); + Trsm::invoke(member, 1.0, W, D); + }); }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("inverse time = %f , # of inverse per min = %f \n", t, - 1.0 / t * 60); + printf("inverse time = %f , # of inverse per min = %f \n", t, 1.0 / t * 60); } /// @@ -392,12 +356,10 @@ int main(int argc, char *argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - const int per_team_scratch = - scratch_view_type::shmem_size(Blk, AA.extent(5)); + const int per_team_scratch = scratch_view_type::shmem_size(Blk, AA.extent(5)); using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; @@ -412,78 +374,53 @@ int main(int argc, char *argv[]) { policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); for (int iter = 0; iter < niter; ++iter) { - auto xxx = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - auto yyy = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto xxx = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto yyy = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); for (int nis = 0; nis < nsweep; ++nis) { Kokkos::parallel_for( - "solve", - policy.set_scratch_size( - 0, - Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo - default_mode_and_algo_type; + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); const int i = member.league_rank() / L; //%AA.extent(0); const int k = member.league_rank() % L; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), - Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), - Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), - Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, k ? k - 1 : 0, 0, - Kokkos::ALL(), Kokkos::ALL(), v); - auto u = Kokkos::subview(WW, Kokkos::ALL(), v); - for (int jvec = 0; jvec < Nvec; ++jvec) { - auto x0 = Kokkos::subview( - xxx, i, jvec, k == 0 ? 0 : k - 1, Kokkos::ALL(), v); - auto x1 = - Kokkos::subview(xxx, i, jvec, k, Kokkos::ALL(), v); - auto x2 = Kokkos::subview(xxx, i, jvec, - k == L - 1 ? 0 : k + 1, - Kokkos::ALL(), v); - auto y1 = - Kokkos::subview(yyy, i, jvec, k, Kokkos::ALL(), v); - auto b = - Kokkos::subview(bb, i, jvec, k, Kokkos::ALL(), v); - - if (L == 1) { - Gemv::invoke(member, 1.0, D, b, 0.0, x1); - } else { - Copy::invoke(member, b, u); - if (k == 0) { - Gemv::invoke(member, -1.0, B, x2, 1.0, - u); - } else if (k == L - 1) { - Gemv::invoke(member, -1.0, C, x0, 1.0, - u); - } else { - Gemv::invoke(member, -1.0, B, x2, 1.0, - u); - Gemv::invoke(member, -1.0, C, x0, 1.0, - u); - } - Gemv::invoke(member, 1.0, D, u, 0.0, y1); - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, k ? k - 1 : 0, 0, Kokkos::ALL(), Kokkos::ALL(), v); + auto u = Kokkos::subview(WW, Kokkos::ALL(), v); + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x0 = Kokkos::subview(xxx, i, jvec, k == 0 ? 0 : k - 1, Kokkos::ALL(), v); + auto x1 = Kokkos::subview(xxx, i, jvec, k, Kokkos::ALL(), v); + auto x2 = Kokkos::subview(xxx, i, jvec, k == L - 1 ? 0 : k + 1, Kokkos::ALL(), v); + auto y1 = Kokkos::subview(yyy, i, jvec, k, Kokkos::ALL(), v); + auto b = Kokkos::subview(bb, i, jvec, k, Kokkos::ALL(), v); + + if (L == 1) { + Gemv::invoke(member, 1.0, D, b, 0.0, x1); + } else { + Copy::invoke(member, b, u); + if (k == 0) { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + } else if (k == L - 1) { + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } else { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); } - }); + Gemv::invoke(member, 1.0, D, u, 0.0, y1); + } + } + }); }); auto tmp = xxx; xxx = yyy; @@ -495,8 +432,7 @@ int main(int argc, char *argv[]) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, - 1.0 / t * 60 * niter); + printf("solve time = %f , # of solves per min = %f\n", t, 1.0 / t * 60 * niter); } /// @@ -507,140 +443,87 @@ int main(int argc, char *argv[]) { using policy_type = Kokkos::TeamPolicy; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); Kokkos::parallel_for( - "compute residual", policy, - KOKKOS_LAMBDA(const typename policy_type::member_type &member) { + "compute residual", policy, KOKKOS_LAMBDA(const typename policy_type::member_type &member) { const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, Acopy.extent(5)), - [&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, - Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, - Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; - ++jvec) { - auto x = Kokkos::subview(xs, i, jvec, nsweep % 2, - Kokkos::ALL(), Kokkos::ALL(), v); - auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), - Kokkos::ALL(), v); - - if (L == 1) { - auto A0 = - Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); - auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); - auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); - - TeamCopy::invoke(member, b0, r0); - TeamGemv::invoke(member, - -1.0, A0, - x0, 1.0, - r0); - } else { - int k = 0; - { - /// first row - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, - -1.0, - A1, x1, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - B2, x2, - 1.0, - rk); - ++k; - } - for (; k < (L - 1); ++k) { - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = - Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, - -1.0, - C0, x0, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - A1, x1, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - B2, x2, - 1.0, - rk); - } - { - // last row - auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), - Kokkos::ALL()); - auto A1 = - Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv::invoke(member, - -1.0, - C0, x0, - 1.0, - rk); - TeamGemv::invoke(member, - -1.0, - A1, x1, - 1.0, - rk); - } - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)), [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, nsweep % 2, Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, b0, r0); + TeamGemv::invoke(member, -1.0, A0, + x0, 1.0, r0); + } else { + int k = 0; + { + /// first row + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, A1, + x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, + x2, 1.0, rk); + ++k; } - }); + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, + x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, + x1, 1.0, rk); + TeamGemv::invoke(member, -1.0, B2, + x2, 1.0, rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), Kokkos::ALL()); + auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, + x0, 1.0, rk); + TeamGemv::invoke(member, -1.0, A1, + x1, 1.0, rk); + } + } + } + }); }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); @@ -650,13 +533,11 @@ int main(int argc, char *argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; - ++i0) // N/vector_length - for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec - for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L - for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk - for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; - ++i4) { // vector_length + for (int i0 = 0, i0end = rs.extent(0); i0 < i0end; ++i0) // N/vector_length + for (int i1 = 0, i1end = rs.extent(1); i1 < i1end; ++i1) // Nvec + for (int i2 = 0, i2end = rs.extent(2); i2 < i2end; ++i2) // L + for (int i3 = 0, i3end = rs.extent(3); i3 < i3end; ++i3) // Blk + for (int i4 = 0, i4end = rs.extent(4); i4 < i4end; ++i4) { // vector_length const auto val = bs_host(i0, i1, i2, i3, i4); const auto res = rs_host(i0, i1, i2, i3, i4); norm2 += val * val; diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp index 5f9c167b72..9ac7e82d3a 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Cuda.cpp @@ -69,8 +69,7 @@ struct Functor { Functor() = default; KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, const ViewType &b, const ViewType &c) - : _a(a), _b(b), _c(c) {} + Functor(const ViewType &a, const ViewType &b, const ViewType &c) : _a(a), _b(b), _c(c) {} KOKKOS_INLINE_FUNCTION void operator()(const RangeTag &, const int k) const { @@ -78,98 +77,81 @@ struct Functor { auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - SerialGemm::invoke( - 1.0, aa, bb, 1.0, cc); + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, - const MemberType &member) const { - const int kbeg = - (member.league_rank() * (member.team_size() * VectorLength) + - member.team_rank() * VectorLength); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm::invoke(1.0, aa, bb, 1.0, cc); - } - }); + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, const MemberType &member) const { + const int kbeg = (member.league_rank() * (member.team_size() * VectorLength) + member.team_rank() * VectorLength); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - TeamGemm::invoke(member, 1.0, aa, bb, 1.0, cc); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + TeamGemm::invoke(member, 1.0, aa, bb, 1.0, cc); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, const MemberType &member) const { const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, - _a.extent(1), _a.extent(2)); - ScratchViewType sb(member.team_scratch(lvl), VectorLength, - _b.extent(1), _b.extent(2)); + ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); + ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - TeamCopy::invoke(member, bb, sbb); - member.team_barrier(); - - TeamGemm::invoke(member, 1.0, saa, sbb, 1.0, cc); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + TeamCopy::invoke(member, bb, sbb); + member.team_barrier(); + + TeamGemm::invoke(member, 1.0, saa, sbb, 1.0, + cc); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagHandmade &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagHandmade &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { - const int i = ij % m, j = ij / m; - typename ViewType::non_const_value_type cval = 0; - for (int p = 0; p < q; ++p) - cval += _a(kk, i, p) * _b(kk, p, j); - _c(kk, i, j) += cval; - }); - } + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + const int i = ij % m, j = ij / m; + typename ViewType::non_const_value_type cval = 0; + for (int p = 0; p < q; ++p) cval += _a(kk, i, p) * _b(kk, p, j); + _c(kk, i, j) += cval; }); + } + }); } }; @@ -177,19 +159,15 @@ template void Gemm(const int NN, const int BlkSize) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; - std::cout << "SIMD is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; } const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); @@ -201,10 +179,8 @@ void Gemm(const int NN, const int BlkSize) { const int iter_begin = -3, iter_end = 30; Kokkos::Timer timer; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), - bmat("bmat", N * VectorLength, BlkSize, BlkSize), - cref("cref", N * VectorLength, BlkSize, BlkSize); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize), cref("cref", N * VectorLength, BlkSize, BlkSize); { Random random; @@ -225,12 +201,9 @@ void Gemm(const int NN, const int BlkSize) { /// /// CUBLAS Strided version /// - const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, - BlkSize, 1, BlkSize, BlkSize); + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, BlkSize, 1, BlkSize, BlkSize); - Kokkos::View a( - "a", stride), - b("b", stride), c("c", stride); + Kokkos::View a("a", stride), b("b", stride), c("c", stride); double tavg = 0, tmin = tmax; @@ -238,13 +211,10 @@ void Gemm(const int NN, const int BlkSize) { cublasHandle_t handle; stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - auto amat_device = - Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); - auto bmat_device = - Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); + auto amat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); + auto bmat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); Kokkos::deep_copy(amat_device, amat); Kokkos::deep_copy(bmat_device, bmat); @@ -268,12 +238,10 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - stat = cublasDgemmStridedBatched( - handle, CUBLAS_OP_N, CUBLAS_OP_N, BlkSize, BlkSize, BlkSize, &one, - (const value_type *)a.data(), BlkSize, BlkSize * BlkSize, - (const value_type *)b.data(), BlkSize, BlkSize * BlkSize, &zero, - (value_type *)c.data(), BlkSize, BlkSize * BlkSize, - N * VectorLength); + stat = cublasDgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, BlkSize, BlkSize, BlkSize, &one, + (const value_type *)a.data(), BlkSize, BlkSize * BlkSize, + (const value_type *)b.data(), BlkSize, BlkSize * BlkSize, &zero, + (value_type *)c.data(), BlkSize, BlkSize * BlkSize, N * VectorLength); Kokkos::fence(); const double t = timer.seconds(); @@ -282,16 +250,14 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); Kokkos::deep_copy(cref, csol); std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Strided" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; } cublasDestroy(handle); @@ -303,15 +269,13 @@ void Gemm(const int NN, const int BlkSize) { /// Range policy version /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { typedef Functor functor_type; - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush @@ -325,8 +289,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", - policy, functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -335,22 +298,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -365,21 +325,18 @@ void Gemm(const int NN, const int BlkSize) { /// expect the same performance as range policy /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; // 128 is rough estimates - const int team_size = - policy_type(N / 32, Kokkos::AUTO, VectorLength) - .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + const int team_size = policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); const policy_type policy(N / team_size, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -394,8 +351,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", - policy, functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -404,23 +360,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -434,26 +386,21 @@ void Gemm(const int NN, const int BlkSize) { /// Team policy V2 - team parallel /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); policy_type policy(N, team_size, VectorLength); @@ -469,8 +416,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", - policy, functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -479,23 +425,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -509,37 +451,29 @@ void Gemm(const int NN, const int BlkSize) { /// Team policy V3 - team parallel + scratch /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int lvl = 0, - per_team_scratch = 2 * ScratchViewType::shmem_size( - VectorLength, BlkSize, BlkSize); + const int lvl = 0, per_team_scratch = 2 * ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); // std::cout << "per team scratch " << per_team_scratch << "\n"; if (per_team_scratch / 1024 < 48) { - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int max_team_size = policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); policy_type policy = - policy_type(N, team_size, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); + policy_type(N, team_size, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush flush.run(); @@ -552,9 +486,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, - functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -563,23 +495,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = Kokkos::create_mirror_view( - typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = " << std::setw(3) - << (per_team_scratch / 1024) << " time = " << std::scientific - << tmin << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) << (per_team_scratch / 1024) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -587,8 +515,7 @@ void Gemm(const int NN, const int BlkSize) { std::cout << std::endl; } else { std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " Scratch per team is too big:" << std::setw(3) - << (per_team_scratch / 1024) << std::endl; + << " Scratch per team is too big:" << std::setw(3) << (per_team_scratch / 1024) << std::endl; } } } @@ -598,19 +525,16 @@ void Gemm(const int NN, const int BlkSize) { /// Team policy - handmade /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(max_team_size, BlkSize * BlkSize); @@ -627,9 +551,7 @@ void Gemm(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, - functor_type(a, b, c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, functor_type(a, b, c)); Kokkos::fence(); const double t = timer.seconds(); @@ -638,23 +560,19 @@ void Gemm(const int NN, const int BlkSize) { } tavg /= iter_end; - auto csol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); Kokkos::deep_copy(csol, c); double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(cref(i, j, k) - - csol(i, j, k)); + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - csol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team HM" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp index 225e10f63b..cfcbb176fa 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemm_Host.hpp @@ -35,7 +35,7 @@ #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" -//#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { namespace PerfTest { @@ -66,25 +66,20 @@ template void Gemm(const int NN) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } @@ -95,8 +90,7 @@ void Gemm(const int NN) { Kokkos::Timer timer; Kokkos::View cref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), bmat("bmat", N * VectorLength, BlkSize, BlkSize); Kokkos::Random_XorShift64_Pool random(13718); @@ -104,13 +98,11 @@ void Gemm(const int NN) { Kokkos::fill_random(bmat, random, value_type(1.0)); typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize), + Kokkos::View amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, BlkSize); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::Pack", - Kokkos::RangePolicy(0, N * VectorLength), + "KokkosBatched::PerfTest::GemmHost::Pack", Kokkos::RangePolicy(0, N * VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k / VectorLength, k1 = k % VectorLength; for (int i = 0; i < BlkSize; ++i) @@ -129,14 +121,11 @@ void Gemm(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), - c("c", N * VectorLength, BlkSize, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double tavg = 0, tmin = tmax; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -152,24 +141,20 @@ void Gemm(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::CblasOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemmHost::CblasOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); const double one = 1.0; if (std::is_same::value) { - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, - BlkSize, BlkSize, one, (double *)aa.data(), - aa.stride_0(), (double *)bb.data(), bb.stride_0(), - one, (double *)cc.data(), cc.stride_0()); - } else if (std::is_same >::value) { - cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, - BlkSize, BlkSize, (void *)&one, (void *)aa.data(), - aa.stride_0(), (void *)bb.data(), bb.stride_0(), - (void *)&one, (void *)cc.data(), cc.stride_0()); + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, BlkSize, BlkSize, one, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0(), one, + (double *)cc.data(), cc.stride_0()); + } else if (std::is_same >::value) { + cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, BlkSize, BlkSize, (void *)&one, + (void *)aa.data(), aa.stride_0(), (void *)bb.data(), bb.stride_0(), (void *)&one, + (void *)cc.data(), cc.stride_0()); } }); @@ -181,10 +166,8 @@ void Gemm(const int NN) { tavg /= iter_end; std::cout << std::setw(12) << "MKL DGEMM" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; cref = c; } @@ -192,14 +175,11 @@ void Gemm(const int NN) { #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) { - typedef Kokkos::View - ViewType; - ViewType a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), + typedef Kokkos::View ViewType; + ViewType a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); - value_type *aa[N * VectorLength], *bb[N * VectorLength], - *cc[N * VectorLength]; + value_type *aa[N * VectorLength], *bb[N * VectorLength], *cc[N * VectorLength]; for (int k = 0; k < N * VectorLength; ++k) { aa[k] = &a(k, 0, 0); @@ -234,15 +214,11 @@ void Gemm(const int NN) { timer.reset(); if (std::is_same::value) { - cblas_dgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, - blksize, one, (const double **)aa, lda, - (const double **)bb, ldb, one, (double **)cc, ldc, - 1, size_per_grp); + cblas_dgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, blksize, one, (const double **)aa, lda, + (const double **)bb, ldb, one, (double **)cc, ldc, 1, size_per_grp); } else if (std::is_same >::value) { - cblas_zgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, - blksize, one, (const void **)aa, lda, - (const void **)bb, ldb, one, (void **)cc, ldc, 1, - size_per_grp); + cblas_zgemm_batch(CblasRowMajor, transA, transB, blksize, blksize, blksize, one, (const void **)aa, lda, + (const void **)bb, ldb, one, (void **)cc, ldc, 1, size_per_grp); } HostSpaceType().fence(); @@ -255,22 +231,18 @@ void Gemm(const int NN) { double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) - for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - c(i, j, k)); + for (int k = 0, kend = cref.extent(2); k < kend; ++k) diff += abs(cref(i, j, k) - c(i, j, k)); std::cout << std::setw(12) << "MKL Batch" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } #endif #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, BlkSize), c("c", N, BlkSize, BlkSize); { @@ -306,19 +278,15 @@ void Gemm(const int NN) { timer.reset(); if (std::is_same::value) { - mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, - BlkSize, BlkSize, done, (const double *)a.data(), - (MKL_INT)a.stride_1(), (const double *)b.data(), - (MKL_INT)b.stride_1(), done, (double *)c.data(), - (MKL_INT)c.stride_1(), format, N * VectorLength); - } else if (std::is_same >::value) { - mkl_zgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, - BlkSize, BlkSize, (MKL_Complex16 *)&zone, - (const double *)a.data(), (MKL_INT)a.stride_1(), - (const double *)b.data(), (MKL_INT)b.stride_1(), - (MKL_Complex16 *)&zone, (double *)c.data(), - (MKL_INT)c.stride_1(), format, N * VectorLength); + mkl_dgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, BlkSize, BlkSize, done, + (const double *)a.data(), (MKL_INT)a.stride_1(), (const double *)b.data(), + (MKL_INT)b.stride_1(), done, (double *)c.data(), (MKL_INT)c.stride_1(), format, + N * VectorLength); + } else if (std::is_same >::value) { + mkl_zgemm_compact(MKL_ROW_MAJOR, MKL_NOTRANS, MKL_NOTRANS, BlkSize, BlkSize, BlkSize, + (MKL_Complex16 *)&zone, (const double *)a.data(), (MKL_INT)a.stride_1(), + (const double *)b.data(), (MKL_INT)b.stride_1(), (MKL_Complex16 *)&zone, + (double *)c.data(), (MKL_INT)c.stride_1(), format, N * VectorLength); } HostSpaceType().fence(); @@ -332,15 +300,12 @@ void Gemm(const int NN) { for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - - c(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(cref(i, j, k) - c(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(12) << "MKL Cmpct" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } } @@ -351,16 +316,13 @@ void Gemm(const int NN) { { libxsmm_init(); - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, BlkSize), - c("c", N * VectorLength, BlkSize, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), c("c", N * VectorLength, BlkSize, BlkSize); libxsmm_blasint lda = a.stride_1(), ldb = b.stride_1(), ldc = c.stride_1(); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double tavg = 0, tmin = tmax; @@ -382,19 +344,15 @@ void Gemm(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); // column major - libxsmm_gemm((const char *)&transA, (const char *)&transB, - blksize, blksize, blksize, (const double *)&one, - (const double *)bb.data(), - (const libxsmm_blasint *)&ldb, - (const double *)aa.data(), - (const libxsmm_blasint *)&lda, (const double *)&one, + libxsmm_gemm((const char *)&transA, (const char *)&transB, blksize, blksize, blksize, + (const double *)&one, (const double *)bb.data(), (const libxsmm_blasint *)&ldb, + (const double *)aa.data(), (const libxsmm_blasint *)&lda, (const double *)&one, (double *)cc.data(), (const libxsmm_blasint *)&ldc); }); @@ -409,15 +367,12 @@ void Gemm(const int NN) { double diff = 0; for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) - for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - c(i, j, k)); + for (int k = 0, kend = cref.extent(2); k < kend; ++k) diff += abs(cref(i, j, k) - c(i, j, k)); std::cout << std::setw(12) << "libxsmm" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } libxsmm_finalize(); } @@ -488,8 +443,7 @@ void Gemm(const int NN) { /// Serial SIMD with appropriate data layout /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, BlkSize), c("c", N, BlkSize, BlkSize); { @@ -510,14 +464,12 @@ void Gemm(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); - SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); }); HostSpaceType().fence(); @@ -531,15 +483,12 @@ void Gemm(const int NN) { for (int i = 0, iend = cref.extent(0); i < iend; ++i) for (int j = 0, jend = cref.extent(1); j < jend; ++j) for (int k = 0, kend = cref.extent(2); k < kend; ++k) - diff += abs(cref(i, j, k) - - c(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(cref(i, j, k) - c(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(12) << "KK Vector" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } std::cout << std::endl; diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp index 9ae401f03f..e368e8c00b 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Gemv_Host.hpp @@ -15,8 +15,8 @@ //@HEADER /// \author Kyungjoo Kim (kyukim@sandia.gov) -//#define __KOKKOSBATCHED_INTEL_MKL__ -//#define __KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #define __KOKKOSBATCHED_INTEL_MKL__ +// #define __KOKKOSBATCHED_INTEL_MKL_BATCHED__ #include @@ -60,47 +60,38 @@ double FlopCount(int mm, int nn) { return (FLOP_MUL * (m * n) + FLOP_ADD * (m * n)); } -template +template void Gemv(const int NN) { typedef Kokkos::Schedule ScheduleType; // typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } - const double flop = - (N * VectorLength) * FlopCount(BlkSize, BlkSize) * NumVecs; + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize) * NumVecs; // const double tmax = 1.0e15; const int iter_begin = -10, iter_end = 100; Kokkos::Timer timer; Kokkos::View yref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize); - Kokkos::View xvec( - "xvec", N * VectorLength, NumVecs, BlkSize); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize); + Kokkos::View xvec("xvec", N * VectorLength, NumVecs, BlkSize); Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(xvec, random, value_type(1.0)); @@ -115,14 +106,11 @@ void Gemv(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - x("x", N * VectorLength, NumVecs, BlkSize), - y("y", N * VectorLength, NumVecs, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + x("x", N * VectorLength, NumVecs, BlkSize), y("y", N * VectorLength, NumVecs, BlkSize); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double t = 0; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -138,17 +126,14 @@ void Gemv(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemvHost::CblasOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemvHost::CblasOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); for (int j = 0; j < NumVecs; ++j) { auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); - cblas_dgemv(CblasRowMajor, CblasNoTrans, BlkSize, BlkSize, 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)xx.data(), xx.stride_0(), 1.0, - (double*)yy.data(), yy.stride_0()); + cblas_dgemv(CblasRowMajor, CblasNoTrans, BlkSize, BlkSize, 1.0, (double*)aa.data(), aa.stride_0(), + (double*)xx.data(), xx.stride_0(), 1.0, (double*)yy.data(), yy.stride_0()); } }); @@ -158,10 +143,8 @@ void Gemv(const int NN) { t /= iter_end; std::cout << std::setw(12) << "MKL DGEMV" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop / t) << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t << " flop/s = " << (flop / t) << std::endl; yref = y; } @@ -172,14 +155,11 @@ void Gemv(const int NN) { /// Plain version (comparable to micro BLAS version) /// { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), - x("x", N * VectorLength, NumVecs, BlkSize), - y("y", N * VectorLength, NumVecs, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), + x("x", N * VectorLength, NumVecs, BlkSize), y("y", N * VectorLength, NumVecs, BlkSize); { - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); double t = 0; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -195,16 +175,14 @@ void Gemv(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemvHost::SerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemvHost::SerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); for (int j = 0; j < NumVecs; ++j) { auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); - SerialGemv::invoke(1.0, aa, xx, - 1.0, yy); + SerialGemv::invoke(1.0, aa, xx, 1.0, yy); } }); @@ -217,38 +195,31 @@ void Gemv(const int NN) { for (int i = 0, iend = yref.extent(0); i < iend; ++i) for (int j = 0, jend = yref.extent(1); j < jend; ++j) for (int k = 0, kend = yref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(yref(i, j, k) - - y(i, j, k)); + diff += Kokkos::ArithTraits::abs(yref(i, j, k) - y(i, j, k)); std::cout << std::setw(12) << "Plain" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop / t) << " diff to ref = " << diff + << " BlkSize = " << std::setw(3) << BlkSize << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t << " flop/s = " << (flop / t) << " diff to ref = " << diff << std::endl; } } typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize), + Kokkos::View amat_simd("amat_simd", N, BlkSize, BlkSize), xvec_simd("xvec_simd", N, NumVecs, BlkSize); for (int k0 = 0; k0 < N; ++k0) for (int k1 = 0; k1 < VectorLength; ++k1) for (int i = 0; i < BlkSize; ++i) { - for (int j = 0; j < NumVecs; ++j) - xvec_simd(k0, j, i)[k1] = xvec(k0 * VectorLength + k1, j, i); - for (int j = 0; j < BlkSize; ++j) - amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); + for (int j = 0; j < NumVecs; ++j) xvec_simd(k0, j, i)[k1] = xvec(k0 * VectorLength + k1, j, i); + for (int j = 0; j < BlkSize; ++j) amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } /// /// Serial SIMD with appropriate data layout /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), x("x", N, NumVecs, BlkSize), y("y", N, NumVecs, BlkSize); { @@ -268,16 +239,14 @@ void Gemv(const int NN) { timer.reset(); Kokkos::parallel_for( - "KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); for (int j = 0; j < NumVecs; ++j) { auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); - SerialGemv::invoke(1.0, aa, xx, - 1.0, yy); + SerialGemv::invoke(1.0, aa, xx, 1.0, yy); } }); @@ -290,14 +259,11 @@ void Gemv(const int NN) { for (int i = 0, iend = yref.extent(0); i < iend; ++i) for (int j = 0, jend = yref.extent(1); j < jend; ++j) for (int k = 0, kend = yref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs( - yref(i, j, k) - y(i / VectorLength, j, k)[i % VectorLength]); + diff += Kokkos::ArithTraits::abs(yref(i, j, k) - y(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(12) << "Serial SIMD" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop / t) << " diff to ref = " << diff + << " BlkSize = " << std::setw(3) << BlkSize << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t << " flop/s = " << (flop / t) << " diff to ref = " << diff << std::endl; } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp index 9909afd943..4d3f7c8fd0 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Cuda.cpp @@ -48,15 +48,11 @@ double FlopCount(int mm, int nn) { double m = (double)mm; double n = (double)nn; if (m > n) - return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + - 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + - FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - - 0.5 * m * n + (1.0 / 6.0) * n)); + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - 0.5 * m * n + (1.0 / 6.0) * n)); else - return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + - 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + - FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - - 0.5 * n * m + (1.0 / 6.0) * m)); + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - 0.5 * n * m + (1.0 / 6.0) * m)); } struct RangeTag {}; @@ -82,57 +78,48 @@ struct Functor { } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, - const MemberType &member) const { - const int kbeg = - (member.league_rank() * (member.team_size() * VectorLength) + - member.team_rank() * VectorLength); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - SerialLU::invoke(aa); - } - }); + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, const MemberType &member) const { + const int kbeg = (member.league_rank() * (member.team_size() * VectorLength) + member.team_rank() * VectorLength); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, aa); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, aa); + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, const MemberType &member) const { const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, - _a.extent(1), _a.extent(2)); + ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - TeamLU::invoke(member, saa); - member.team_barrier(); - TeamCopy::invoke(member, saa, aa); - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + TeamLU::invoke(member, saa); + member.team_barrier(); + TeamCopy::invoke(member, saa, aa); + } + }); } }; @@ -140,19 +127,15 @@ template void LU(const int NN, const int BlkSize) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; - std::cout << "SIMD is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; } const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); @@ -164,8 +147,7 @@ void LU(const int NN, const int BlkSize) { const int iter_begin = -3, iter_end = 50; Kokkos::Timer timer; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), aref("aref", N * VectorLength, BlkSize, BlkSize); { @@ -202,22 +184,18 @@ void LU(const int NN, const int BlkSize) { /// /// CUBLAS Batch version /// - const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, - BlkSize, 1, BlkSize, BlkSize); + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, BlkSize, 1, BlkSize, BlkSize); - Kokkos::View a( - "a", stride); + Kokkos::View a("a", stride); Kokkos::View info("info", N * VectorLength); cublasStatus_t stat; cublasHandle_t handle; stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - auto amat_device = Kokkos::create_mirror_view( - typename DeviceSpaceType::memory_space(), amat); + auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); Kokkos::deep_copy(amat_device, amat); Kokkos::fence(); @@ -229,12 +207,10 @@ void LU(const int NN, const int BlkSize) { aa[k] = a.data() + k * a.stride_0(); } value_type **aa_device; - if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != - cudaSuccess) { + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != cudaSuccess) { Kokkos::abort("CUDA memory allocation failed\n"); } - if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, - cudaMemcpyHostToDevice) != cudaSuccess) { + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, cudaMemcpyHostToDevice) != cudaSuccess) { Kokkos::abort("CUDA memcpy failed\n"); } Kokkos::fence(); @@ -248,8 +224,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - stat = cublasDgetrfBatched(handle, BlkSize, (value_type **)aa_device, - BlkSize, NULL, (int *)info.data(), + stat = cublasDgetrfBatched(handle, BlkSize, (value_type **)aa_device, BlkSize, NULL, (int *)info.data(), N * VectorLength); if (stat != CUBLAS_STATUS_SUCCESS) { Kokkos::abort("CUBLAS LU Batched failed\n"); @@ -262,8 +237,7 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); Kokkos::deep_copy(aref, asol); @@ -274,8 +248,7 @@ void LU(const int NN, const int BlkSize) { std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batch" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; } } @@ -291,8 +264,7 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { typedef Functor functor_type; - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush @@ -304,8 +276,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::RangeTag", - policy, functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::RangeTag", policy, functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -314,22 +285,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -346,13 +314,11 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int team_size = - policy_type(N / 32, Kokkos::AUTO, VectorLength) - .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + const int team_size = policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); const policy_type policy(N / team_size, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -365,8 +331,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", - policy, functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", policy, functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -375,23 +340,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -408,13 +369,11 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::LU::Blocked::mb(); + const int is_blocked_algo = (std::is_same::value), + mb = Algo::LU::Blocked::mb(); // mp = BlkSize%mb > 0; const int @@ -422,8 +381,7 @@ void LU(const int NN, const int BlkSize) { mblk = is_blocked_algo ? (BlkSize - mb) : (BlkSize - 1); const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); const policy_type policy(N, team_size, VectorLength); @@ -437,8 +395,7 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", - policy, functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", policy, functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -447,23 +404,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -480,27 +433,22 @@ void LU(const int NN, const int BlkSize) { double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int lvl = 0, - per_team_scratch = ScratchViewType::shmem_size( - VectorLength, BlkSize, BlkSize); + const int lvl = 0, per_team_scratch = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); if (per_team_scratch / 1024 < 48) { - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::LU::Blocked::mb(); + const int is_blocked_algo = (std::is_same::value), + mb = Algo::LU::Blocked::mb(); // mp = BlkSize%mb > 0; const int // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; mblk = is_blocked_algo ? (BlkSize - mb) : (BlkSize - 1); - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int max_team_size = policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); policy_type policy(N, team_size, VectorLength); @@ -514,10 +462,8 @@ void LU(const int NN, const int BlkSize) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUCuda::TeamTagV3", - policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)), - functor_type(a)); + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV3", + policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)), functor_type(a)); Kokkos::fence(); const double t = timer.seconds(); @@ -526,23 +472,19 @@ void LU(const int NN, const int BlkSize) { } tavg /= iter_end; - auto asol = Kokkos::create_mirror_view( - typename HostSpaceType::memory_space(), a); + auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); Kokkos::deep_copy(asol, a); double diff = 0; for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(aref(i, j, k) - - asol(i, j, k)); + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - asol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = " << std::setw(3) - << (per_team_scratch / 1024) << " time = " << std::scientific - << tmin << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) << (per_team_scratch / 1024) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -550,8 +492,7 @@ void LU(const int NN, const int BlkSize) { std::cout << std::endl; } else { std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " Scratch per team is too big (KB): " - << (per_team_scratch / 1024) << std::endl; + << " Scratch per team is too big (KB): " << (per_team_scratch / 1024) << std::endl; } } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp index d17f9b9003..f27365694a 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_LU_Host.hpp @@ -15,8 +15,8 @@ //@HEADER /// \author Kyungjoo Kim (kyukim@sandia.gov) -//#define __KOKKOSBATCHED_INTEL_MKL__ -//#define __KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #define __KOKKOSBATCHED_INTEL_MKL__ +// #define __KOKKOSBATCHED_INTEL_MKL_BATCHED__ #include #include "KokkosBatched_Util.hpp" @@ -57,15 +57,11 @@ double FlopCount(int mm, int nn) { double m = (double)mm; double n = (double)nn; if (m > n) - return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + - 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + - FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - - 0.5 * m * n + (1.0 / 6.0) * n)); + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - 0.5 * m * n + (1.0 / 6.0) * n)); else - return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + - 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + - FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - - 0.5 * n * m + (1.0 / 6.0) * m)); + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - 0.5 * n * m + (1.0 / 6.0) * m)); } template @@ -73,26 +69,21 @@ void LU(const int NN) { typedef Kokkos::Schedule ScheduleType; // typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } @@ -106,8 +97,7 @@ void LU(const int NN) { /// Reference version using MKL DGETRF /// Kokkos::View aref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize); Random random; @@ -124,12 +114,11 @@ void LU(const int NN) { } typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); + Kokkos::View amat_simd("amat_simd", N, BlkSize, + BlkSize); //, a("a", N, BlkSize, BlkSize); Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUHost::Pack", - Kokkos::RangePolicy(0, N * VectorLength), + "KokkosBatched::PerfTest::LUHost::Pack", Kokkos::RangePolicy(0, N * VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k / VectorLength, k1 = k % VectorLength; for (int i = 0; i < BlkSize; ++i) @@ -147,10 +136,8 @@ void LU(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize); - Kokkos::View p( - "p", N * VectorLength, BlkSize); + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize); + Kokkos::View p("p", N * VectorLength, BlkSize); { double tavg = 0, tmin = tmax; for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -163,16 +150,12 @@ void LU(const int NN) { HostSpaceType().fence(); timer.reset(); - Kokkos::RangePolicy policy( - 0, N * VectorLength); + Kokkos::RangePolicy policy(0, N * VectorLength); Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto pp = Kokkos::subview(p, k, Kokkos::ALL()); - LAPACKE_dgetrf(LAPACK_ROW_MAJOR, BlkSize, BlkSize, - (double*)aa.data(), aa.stride_0(), - (int*)pp.data()); + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, BlkSize, BlkSize, (double*)aa.data(), aa.stride_0(), (int*)pp.data()); }); HostSpaceType().fence(); @@ -183,10 +166,8 @@ void LU(const int NN) { tavg /= iter_end; std::cout << std::setw(10) << "MKL LU" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; } aref = a; @@ -197,8 +178,7 @@ void LU(const int NN) { #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) { - Kokkos::View a( - "a", N, BlkSize, BlkSize); + Kokkos::View a("a", N, BlkSize, BlkSize); { double tavg = 0, tmin = tmax; @@ -220,8 +200,7 @@ void LU(const int NN) { HostSpaceType().fence(); timer.reset(); - mkl_dgetrfnp_compact(MKL_ROW_MAJOR, BlkSize, BlkSize, - (double*)a.data(), a.stride_1(), (MKL_INT*)&info, + mkl_dgetrfnp_compact(MKL_ROW_MAJOR, BlkSize, BlkSize, (double*)a.data(), a.stride_1(), (MKL_INT*)&info, format, (MKL_INT)N * VectorLength); HostSpaceType().fence(); @@ -235,15 +214,12 @@ void LU(const int NN) { for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += abs(aref(i, j, k) - - a(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(aref(i, j, k) - a(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "MKL Cmpt" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } } @@ -307,8 +283,7 @@ void LU(const int NN) { /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize); + Kokkos::View a("a", N, BlkSize, BlkSize); { double tavg = 0, tmin = tmax; @@ -324,8 +299,7 @@ void LU(const int NN) { Kokkos::RangePolicy policy(0, N); Kokkos::parallel_for( - "KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); SerialLU::invoke(aa); @@ -342,14 +316,11 @@ void LU(const int NN) { for (int i = 0, iend = aref.extent(0); i < iend; ++i) for (int j = 0, jend = aref.extent(1); j < jend; ++j) for (int k = 0, kend = aref.extent(2); k < kend; ++k) - diff += abs(aref(i, j, k) - - a(i / VectorLength, j, k)[i % VectorLength]); + diff += abs(aref(i, j, k) - a(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "SIMD" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff + << std::endl; } } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp index f99ee9dc80..99f1a1d537 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp @@ -50,15 +50,13 @@ typedef double value_type; double FlopCountLower(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } double FlopCountUpper(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } struct RangeTag {}; @@ -67,8 +65,7 @@ struct TeamTagV2 {}; struct TeamTagV3 {}; struct TeamTagHandmade {}; -template +template struct Functor { ConstUnmanagedViewType _a; UnmanagedViewType _b; @@ -86,160 +83,131 @@ struct Functor { switch (test) { case 0: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 1: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 2: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 3: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 4: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; } } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, - const MemberType &member) const { - const int kbeg = - (member.league_rank() * (member.team_size() * VectorLength) + - member.team_rank() * VectorLength); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm::invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm::invoke(1.0, aa, bb); - break; - } - } - }); + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, const MemberType &member) const { + const int kbeg = (member.league_rank() * (member.team_size() * VectorLength) + member.team_rank() * VectorLength); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; + } + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, const MemberType &member) const { const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 1: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 2: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 3: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - case 4: - TeamTrsm::invoke(member, 1.0, aa, bb); - break; - } - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + case 1: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + case 2: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + case 3: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + case 4: + TeamTrsm::invoke( + member, 1.0, aa, bb); + break; + } + } + }); } template - KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, - const MemberType &member) const { + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, const MemberType &member) const { const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, - _a.extent(1), _a.extent(2)); + ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); // ScratchViewType sb(member.team_scratch(lvl), VectorLength, // _b.extent(1), _b.extent(2)); const int kbeg = member.league_rank() * VectorLength; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - - switch (test) { - case 0: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 1: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 2: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 3: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - case 4: - TeamTrsm::invoke(member, 1.0, saa, bb); - break; - } - } - }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + + switch (test) { + case 0: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 1: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 2: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 3: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + case 4: + TeamTrsm::invoke( + member, 1.0, saa, bb); + break; + } + } + }); } }; @@ -247,19 +215,15 @@ template void Trsm(const int NN, const int BlkSize, const int NumCols) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; - std::cout << "SIMD is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; } switch (test) { @@ -288,17 +252,14 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { const int iter_begin = -3, iter_end = 30; Kokkos::Timer timer; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), - bmat("bmat", N * VectorLength, BlkSize, NumCols), - bref("bmat", N * VectorLength, BlkSize, NumCols); + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, NumCols), bref("bmat", N * VectorLength, BlkSize, NumCols); { Random random; for (int k = 0; k < N * VectorLength; ++k) { for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - amat(k, i, j) = random.value() + 4.0 * (i == j); + for (int j = 0; j < BlkSize; ++j) amat(k, i, j) = random.value() + 4.0 * (i == j); for (int i = 0; i < BlkSize; ++i) for (int j = 0; j < NumCols; ++j) bmat(k, i, j) = random.value(); } @@ -313,24 +274,18 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// /// CUBLAS Batch version /// - const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, - BlkSize, 1, BlkSize, BlkSize); + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, BlkSize, 1, BlkSize, BlkSize); - Kokkos::View a( - "a", stride), - b("b", stride); + Kokkos::View a("a", stride), b("b", stride); cublasStatus_t stat; cublasHandle_t handle; stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - auto amat_device = Kokkos::create_mirror_view( - typename DeviceSpaceType::memory_space(), amat); - auto bmat_device = Kokkos::create_mirror_view( - typename DeviceSpaceType::memory_space(), bmat); + auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); + auto bmat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), bmat); Kokkos::deep_copy(amat_device, amat); Kokkos::deep_copy(bmat_device, bmat); @@ -346,16 +301,12 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { bb[k] = b.data() + k * b.stride_0(); } value_type **aa_device, **bb_device; - if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != - cudaSuccess || - cudaMalloc(&bb_device, N * VectorLength * sizeof(value_type *)) != - cudaSuccess) { + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != cudaSuccess || + cudaMalloc(&bb_device, N * VectorLength * sizeof(value_type *)) != cudaSuccess) { Kokkos::abort("CUDA memory allocation failed\n"); } - if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, - cudaMemcpyHostToDevice) != cudaSuccess || - cudaMemcpy(bb_device, bb, sizeof(value_type *) * N * VectorLength, - cudaMemcpyHostToDevice) != cudaSuccess) { + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, cudaMemcpyHostToDevice) != cudaSuccess || + cudaMemcpy(bb_device, bb, sizeof(value_type *) * N * VectorLength, cudaMemcpyHostToDevice) != cudaSuccess) { Kokkos::abort("CUDA memcpy failed\n"); } Kokkos::fence(); @@ -371,47 +322,37 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { switch (test) { case 0: { // Left, Lower, NoTrans, UnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, - CUBLAS_DIAG_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, + BlkSize, NumCols, &one, (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 1: { // Left, Lower, NoTrans, NonUnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, (const value_type **)aa_device, + BlkSize, (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 2: { // Right, Upper, NoTrans, UnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, - CUBLAS_DIAG_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, + BlkSize, NumCols, &one, (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 3: { // Right, Upper, NoTrans, NonUnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, (const value_type **)aa_device, + BlkSize, (value_type **)bb_device, BlkSize, N * VectorLength); break; } case 4: { // Left, Upper, NoTrans, NonUnitDiag - stat = cublasDtrsmBatched( - handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, - (const value_type **)aa_device, BlkSize, - (value_type **)bb_device, BlkSize, N * VectorLength); + stat = cublasDtrsmBatched(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, (const value_type **)aa_device, + BlkSize, (value_type **)bb_device, BlkSize, N * VectorLength); break; } } @@ -426,22 +367,19 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); Kokkos::deep_copy(bref, bsol); - if (cudaFree(aa_device) != cudaSuccess || - cudaFree(bb_device) != cudaSuccess) { + if (cudaFree(aa_device) != cudaSuccess || cudaFree(bb_device) != cudaSuccess) { Kokkos::abort("CUDA memory free failed\n"); } std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batched" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = N/A" << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin) << std::endl; } cublasDestroy(handle); @@ -453,14 +391,12 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Range policy version /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { typedef Functor functor_type; - const Kokkos::RangePolicy policy( - 0, N * VectorLength); + const Kokkos::RangePolicy policy(0, N * VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { // flush @@ -473,8 +409,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, functor_type(a, b)); Kokkos::fence(); const double t = timer.seconds(); @@ -483,23 +418,20 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = N/A" << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -513,18 +445,15 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Team policy V1 - almost same scheduling with range policy /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int team_size = - policy_type(N / 32, Kokkos::AUTO, VectorLength) - .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + const int team_size = policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); const policy_type policy(N / team_size, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -538,8 +467,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { Kokkos::fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, functor_type(a, b)); Kokkos::fence(); const double t = timer.seconds(); @@ -548,24 +476,19 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -579,27 +502,21 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Team policy V2 - team parallel /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = - std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); const policy_type policy(N, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -613,8 +530,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { DeviceSpaceType().fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, functor_type(a, b)); DeviceSpaceType().fence(); const double t = timer.seconds(); @@ -623,24 +539,19 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = - Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; @@ -654,33 +565,25 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { /// Team policy V3 - team parallel + sratch /// typedef Kokkos::View view_type; - view_type a("a", N * VectorLength, BlkSize, BlkSize), - b("b", N * VectorLength, BlkSize, NumCols); + view_type a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); double tavg = 0, tmin = tmax; { - typedef Kokkos::TeamPolicy - policy_type; + typedef Kokkos::TeamPolicy policy_type; typedef Functor functor_type; - const int lvl = 0, - per_team_scratch = ScratchViewType::shmem_size( - VectorLength, BlkSize, BlkSize); + const int lvl = 0, per_team_scratch = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); if (per_team_scratch / 1024 < 48) { - const int is_blocked_algo = - (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize % mb > 0; + const int is_blocked_algo = (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), mp = BlkSize % mb > 0; const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength) - .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = - std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + const int max_team_size = policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); policy_type policy(N, team_size, VectorLength); for (int iter = iter_begin; iter < iter_end; ++iter) { @@ -694,8 +597,7 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { DeviceSpaceType().fence(); timer.reset(); - Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, - functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, functor_type(a, b)); DeviceSpaceType().fence(); const double t = timer.seconds(); @@ -704,33 +606,27 @@ void Trsm(const int NN, const int BlkSize, const int NumCols) { } tavg /= iter_end; - auto bsol = Kokkos::create_mirror_view( - typename HostSpaceType::memory_space(), b); + auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); Kokkos::deep_copy(bsol, b); double diff = 0; for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - bsol(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - bsol(i, j, k)); std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = " << std::setw(3) << team_size - << " ScratchSize (KB) = " << std::setw(3) - << (per_team_scratch / 1024) << " time = " << std::scientific - << tmin << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin); + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) std::cout << " diff to ref = " << diff; #endif std::cout << std::endl; } else { std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" - << " Scratch per team is too big (KB): " - << (per_team_scratch / 1024) << std::endl; + << " Scratch per team is too big (KB): " << (per_team_scratch / 1024) << std::endl; } } } diff --git a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp index 52b2395b8d..5e8c6a6abc 100644 --- a/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp +++ b/perf_test/batched/dense/do-not-use/KokkosBatched_Test_Trsm_Host.hpp @@ -30,7 +30,7 @@ #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Trsm_Serial_Impl.hpp" -//#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ +// #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { namespace PerfTest { @@ -54,41 +54,33 @@ typedef double value_type; double FlopCountLower(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } double FlopCountUpper(int mm, int nn) { double m = (double)mm; double n = (double)nn; - return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + - FLOP_ADD * (0.5 * m * n * (n - 1.0))); + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + FLOP_ADD * (0.5 * m * n * (n - 1.0))); } -template +template void Trsm(const int NN) { typedef Kokkos::Schedule ScheduleType; - constexpr int VectorLength = - DefaultVectorLength::value; - const int N = NN / VectorLength; + constexpr int VectorLength = DefaultVectorLength::value; + const int N = NN / VectorLength; { std::string value_type_name; if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) - value_type_name = "Kokkos::complex"; + if (std::is_same >::value) value_type_name = "Kokkos::complex"; #if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name - << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " - << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " + << VectorLength << "\n"; #endif } @@ -120,13 +112,11 @@ void Trsm(const int NN) { /// Reference version using MKL DTRSM /// Kokkos::View bref; - Kokkos::View amat( - "amat", N * VectorLength, BlkSize, BlkSize), + Kokkos::View amat("amat", N * VectorLength, BlkSize, BlkSize), bmat("bmat", N * VectorLength, BlkSize, NumCols); typedef Vector, VectorLength> VectorType; - Kokkos::View amat_simd( - "amat_simd", N, BlkSize, BlkSize), + Kokkos::View amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, NumCols); Random random; @@ -154,8 +144,7 @@ void Trsm(const int NN) { /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); { @@ -171,44 +160,32 @@ void Trsm(const int NN) { HostSpaceType().fence(); timer.reset(); - Kokkos::RangePolicy policy( - 0, N * VectorLength); + Kokkos::RangePolicy policy(0, N * VectorLength); Kokkos::parallel_for( - "KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); switch (test) { case 0: - cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, - CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 1: - cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, - CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 2: - cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, - CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 3: - cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, - CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; case 4: - cblas_dtrsm(CblasRowMajor, CblasLeft, CblasUpper, - CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, - (double *)aa.data(), aa.stride_0(), - (double *)bb.data(), bb.stride_0()); + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), (double *)bb.data(), bb.stride_0()); break; } }); @@ -223,24 +200,19 @@ void Trsm(const int NN) { double sum = 0; for (int i = 0, iend = b.extent(0); i < iend; ++i) for (int j = 0, jend = b.extent(1); j < jend; ++j) - for (int k = 0, kend = b.extent(2); k < kend; ++k) - sum += Kokkos::ArithTraits::abs(bmat(i, j, k)); + for (int k = 0, kend = b.extent(2); k < kend; ++k) sum += Kokkos::ArithTraits::abs(bmat(i, j, k)); std::cout << std::setw(10) << "MKL TRSM" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) << " sum abs(B) = " << sum - << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " sum abs(B) = " << sum << std::endl; bref = b; } } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) { - Kokkos::View a( - "a", N * VectorLength, BlkSize, BlkSize), + Kokkos::View a("a", N * VectorLength, BlkSize, BlkSize), b("b", N * VectorLength, BlkSize, NumCols); value_type *aa[N * VectorLength], *bb[N * VectorLength]; @@ -280,8 +252,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -291,8 +262,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasNonUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -302,8 +272,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -313,8 +282,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasNonUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -324,8 +292,7 @@ void Trsm(const int NN) { CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; CBLAS_DIAG diag[1] = {CblasNonUnit}; - cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, - numcols, one, (const double **)aa, lda, + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, numcols, one, (const double **)aa, lda, (double **)bb, ldb, 1, size_per_grp); break; } @@ -342,24 +309,19 @@ void Trsm(const int NN) { for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs(bref(i, j, k) - - b(i, j, k)); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - b(i, j, k)); std::cout << std::setw(10) << "MKL Batch" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff << std::endl; } } #endif #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, NumCols); { @@ -392,10 +354,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_UNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 1: { @@ -404,10 +365,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_NONUNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 2: { @@ -416,10 +376,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_UNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 3: { @@ -428,10 +387,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_NONUNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } case 4: { @@ -440,10 +398,9 @@ void Trsm(const int NN) { MKL_TRANSPOSE transA = MKL_NOTRANS; MKL_DIAG diag = MKL_NONUNIT; - mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, - BlkSize, NumCols, one, (const double *)a.data(), - a.stride_1(), (double *)b.data(), b.stride_1(), - format, (MKL_INT)N * VectorLength); + mkl_dtrsm_compact(MKL_ROW_MAJOR, side, uplo, transA, diag, BlkSize, NumCols, one, + (const double *)a.data(), a.stride_1(), (double *)b.data(), b.stride_1(), format, + (MKL_INT)N * VectorLength); break; } } @@ -459,16 +416,12 @@ void Trsm(const int NN) { for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs( - bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "MKL Cmpt" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff << std::endl; } } } @@ -557,8 +510,7 @@ void Trsm(const int NN) { /// SIMD with appropriate data layout /// { - Kokkos::View a( - "a", N, BlkSize, BlkSize), + Kokkos::View a("a", N, BlkSize, BlkSize), b("b", N, BlkSize, NumCols); { @@ -576,31 +528,29 @@ void Trsm(const int NN) { Kokkos::RangePolicy policy(0, N); Kokkos::parallel_for( - "KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", policy, - KOKKOS_LAMBDA(const int k) { + "KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); switch (test) { case 0: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 1: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; case 2: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; case 3: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; case 4: - SerialTrsm::invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, + bb); break; } }); @@ -616,16 +566,12 @@ void Trsm(const int NN) { for (int i = 0, iend = bref.extent(0); i < iend; ++i) for (int j = 0, jend = bref.extent(1); j < jend; ++j) for (int k = 0, kend = bref.extent(2); k < kend; ++k) - diff += Kokkos::ArithTraits::abs( - bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); std::cout << std::setw(10) << "KK Vector" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop / tavg) - << " max flop/s = " << (flop / tmin) - << " diff to ref = " << diff << std::endl; + << " BlkSize = " << std::setw(3) << BlkSize << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " diff to ref = " << diff << std::endl; } } std::cout << "\n\n"; diff --git a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp index e289f8fa52..5722480212 100644 --- a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp +++ b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_1.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorCG_1 { const ValuesViewType _D; const IntView _r; @@ -26,12 +26,9 @@ struct Functor_TestBatchedTeamVectorCG_1 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorCG_1(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team, - const int team_size, - const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorCG_1(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -47,20 +44,15 @@ struct Functor_TestBatchedTeamVectorCG_1 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; Operator A(d, _r, _c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, _handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, _handle); } inline double run() { @@ -70,13 +62,10 @@ struct Functor_TestBatchedTeamVectorCG_1 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), 4 * _X.extent(1)); + _handle.tmp_view = typename KrylovHandleType::TemporaryViewType("", _X.extent(0), 4 * _X.extent(1)); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) diff --git a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp index b3451938c5..5749d640d0 100644 --- a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp +++ b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_2.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorCG_2 { const ValuesViewType _D; const IntView _r; @@ -26,12 +26,9 @@ struct Functor_TestBatchedTeamVectorCG_2 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorCG_2(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team, - const int team_size, - const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorCG_2(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -47,41 +44,27 @@ struct Functor_TestBatchedTeamVectorCG_2 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - using ScratchPadIntViewType = - Kokkos::View; + using ScratchPadIntViewType = Kokkos::View; - using Operator = - KokkosBatched::CrsMatrix; + using Operator = KokkosBatched::CrsMatrix; - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, _handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, _handle); } inline double run() { @@ -91,13 +74,10 @@ struct Functor_TestBatchedTeamVectorCG_2 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), 4 * _X.extent(1)); + _handle.tmp_view = typename KrylovHandleType::TemporaryViewType("", _X.extent(0), 4 * _X.extent(1)); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) diff --git a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp index 3dbfca7f15..9df01fd5f0 100644 --- a/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp +++ b/perf_test/batched/sparse/CG/Functor_TestBatchedTeamVectorCG_3.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorCG_3 { const ValuesViewType _D; const IntView _r; @@ -26,12 +26,9 @@ struct Functor_TestBatchedTeamVectorCG_3 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorCG_3(const ValuesViewType &D, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team, - const int team_size, - const int vector_length, - KrylovHandleType &handle) + Functor_TestBatchedTeamVectorCG_3(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -47,41 +44,27 @@ struct Functor_TestBatchedTeamVectorCG_3 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - using ScratchPadIntViewType = - Kokkos::View; + using ScratchPadIntViewType = Kokkos::View; - using Operator = - KokkosBatched::CrsMatrix; + using Operator = KokkosBatched::CrsMatrix; - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); - KokkosBatched::TeamVectorCG::template invoke( - member, A, b, x, _handle); + KokkosBatched::TeamVectorCG::template invoke(member, A, b, x, _handle); } inline double run() { @@ -91,10 +74,8 @@ struct Functor_TestBatchedTeamVectorCG_3 { _handle.set_memory_strategy(0); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -106,7 +87,7 @@ struct Functor_TestBatchedTeamVectorCG_3 { size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_0 = ValuesViewType::shmem_size(_N_team, 5); - size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 4 * _X.extent(1)); + size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 4 * _X.extent(1)); policy.set_scratch_size(0, Kokkos::PerTeam(bytes_int + bytes_0 + bytes_1)); diff --git a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp index 5bf6061fe4..e0440ddbfd 100644 --- a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp +++ b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp @@ -73,50 +73,41 @@ int main(int argc, char *argv[]) { for (int i = 1; i < argc; ++i) { const std::string &token = argv[i]; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos Batched CG performance test options:" << std::endl - << "-A : Filename of the input batched matrix." - << std::endl - << "-B : Filename of the input batched right-hand " - "side." - << std::endl - << "-X : Filename of the output batched solution." - << std::endl - << "-res : Filename of the output residual history." - << std::endl - << "-timers : Filename of the output timers." - << std::endl - << "-n1 : Number of repetitions of the experience." - << std::endl - << "-n2 : Number of the kernel calls inside one " - "experience." - << std::endl - << "-team_size : Used team size." << std::endl - << "-n_implementations: Number of implementations to use: test " - "all " - "implementations [0, specified number -1]." - << std::endl - << "-implementation : Specify only one implementation at a time." - << std::endl - << " Note: implementation 0 : use scratch pad " - "only for scalar temporary variable." - << std::endl - << " Note: implementation 1 : use scratch pad " - "for scalar temporary variables and for the graph of the " - "matrices." - << std::endl - << " Note: implementation 2 : use scratch pad " - "for scalar and vector temporary variables and for the graph of " - "the matrices." - << std::endl - << "-l : Specify left layout." << std::endl - << "-r : Specify right layout." << std::endl - << "-C : Specify if the convergence is monitored." - << std::endl - << "-N_team : Specify the number of systems per team." - << std::endl - << "-vector_length : Specify the vector length." << std::endl - << std::endl; + std::cout << "Kokkos Batched CG performance test options:" << std::endl + << "-A : Filename of the input batched matrix." << std::endl + << "-B : Filename of the input batched right-hand " + "side." + << std::endl + << "-X : Filename of the output batched solution." << std::endl + << "-res : Filename of the output residual history." << std::endl + << "-timers : Filename of the output timers." << std::endl + << "-n1 : Number of repetitions of the experience." << std::endl + << "-n2 : Number of the kernel calls inside one " + "experience." + << std::endl + << "-team_size : Used team size." << std::endl + << "-n_implementations: Number of implementations to use: test " + "all " + "implementations [0, specified number -1]." + << std::endl + << "-implementation : Specify only one implementation at a time." << std::endl + << " Note: implementation 0 : use scratch pad " + "only for scalar temporary variable." + << std::endl + << " Note: implementation 1 : use scratch pad " + "for scalar temporary variables and for the graph of the " + "matrices." + << std::endl + << " Note: implementation 2 : use scratch pad " + "for scalar and vector temporary variables and for the graph of " + "the matrices." + << std::endl + << "-l : Specify left layout." << std::endl + << "-r : Specify right layout." << std::endl + << "-C : Specify if the convergence is monitored." << std::endl + << "-N_team : Specify the number of systems per team." << std::endl + << "-vector_length : Specify the vector length." << std::endl + << std::endl; return 0; } if (token == std::string("-A")) name_A = argv[++i]; @@ -131,10 +122,8 @@ int main(int argc, char *argv[]) { if (token == std::string("-n1")) n_rep_1 = std::atoi(argv[++i]); if (token == std::string("-n2")) n_rep_2 = std::atoi(argv[++i]); if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -144,10 +133,8 @@ int main(int argc, char *argv[]) { layout_right = true; } if (token == std::string("-C")) monitor_convergence = true; - if (token == std::string("-N_team")) - N_team_potential = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); + if (token == std::string("-N_team")) N_team_potential = std::atoi(argv[++i]); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); } int N, Blk, nnz, ncols; @@ -157,16 +144,14 @@ int main(int argc, char *argv[]) { if (impls.size() == 0) for (int i = 0; i < n_impl; ++i) impls.push_back(i); - std::cout << "N_team_potential = " << N_team_potential << ", n = " << Blk - << ", N = " << N << ", team_size = " << team_size - << ", vector_length = " << vector_length << std::endl; + std::cout << "N_team_potential = " << N_team_potential << ", n = " << Blk << ", N = " << N + << ", team_size = " << team_size << ", vector_length = " << vector_length << std::endl; // V100 L2 cache 6MB per core constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf(" :::: CG Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", - N, Blk, nnz, vector_length, n_rep_1); + printf(" :::: CG Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", N, Blk, nnz, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -193,12 +178,9 @@ int main(int argc, char *argv[]) { XYTypeLL yLL("values", N, Blk); if (layout_left) - printf(" :::: Testing left layout (team_size = %d, vector_length = %d)\n", - team_size, vector_length); + printf(" :::: Testing left layout (team_size = %d, vector_length = %d)\n", team_size, vector_length); if (layout_right) - printf( - " :::: Testing right layout (team_size = %d, vector_length = %d)\n", - team_size, vector_length); + printf(" :::: Testing right layout (team_size = %d, vector_length = %d)\n", team_size, vector_length); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); @@ -226,9 +208,7 @@ int main(int argc, char *argv[]) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KokkosBatched::KrylovHandle; + using KrylovHandleType = KokkosBatched::KrylovHandle; KrylovHandleType handle(N, N_team); handle.set_scratch_pad_level(0); @@ -246,56 +226,38 @@ int main(int argc, char *argv[]) { if (i_impl == 0 && layout_left) { t_spmv += - Functor_TestBatchedTeamVectorCG_1( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_1( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 1 && layout_left) { t_spmv += - Functor_TestBatchedTeamVectorCG_2( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_2( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 2 && layout_left) { t_spmv += - Functor_TestBatchedTeamVectorCG_3( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_3( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 0 && layout_right) { t_spmv += - Functor_TestBatchedTeamVectorCG_1( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_1( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 1 && layout_right) { t_spmv += - Functor_TestBatchedTeamVectorCG_2( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_2( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, handle) .run(); } if (i_impl == 2 && layout_right) { t_spmv += - Functor_TestBatchedTeamVectorCG_3( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, handle) + Functor_TestBatchedTeamVectorCG_3( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, handle) .run(); } exec_space().fence(); @@ -310,10 +272,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -326,15 +286,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + std::to_string(i_impl) + "_l.mm", xLL); @@ -343,8 +298,7 @@ int main(int argc, char *argv[]) { writeArrayToMM(name_X + std::to_string(i_impl) + "_r.mm", xLR); } if (monitor_convergence) { - writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", - handle.residual_norms); + writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", handle.residual_norms); } } } diff --git a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp index 0640ac8151..068960bbb6 100644 --- a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp +++ b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_1.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorGMRES_1 { const ValuesViewType _D; const ValuesViewType _diag; @@ -32,12 +32,11 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_1( - const ValuesViewType &D, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int N_team, - const int team_size, const int vector_length, const int N_iteration, - const double tol, const int ortho_strategy, const int arnoldi_level, - const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_1(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, const int N_iteration, + const double tol, const int ortho_strategy, const int arnoldi_level, + const int other_level, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -54,12 +53,11 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_1( - const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, - const IntView &c, const VectorViewType &X, const VectorViewType &B, - const int N_team, const int team_size, const int vector_length, - const int N_iteration, const double tol, int ortho_strategy, - const int arnoldi_level, const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_1(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int N_team, const int team_size, const int vector_length, + const int N_iteration, const double tol, int ortho_strategy, + const int arnoldi_level, const int other_level, KrylovHandleType &handle) : _D(D), _diag(diag), _r(r), @@ -81,31 +79,25 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using Operator = KokkosBatched::CrsMatrix; Operator A(d, _r, _c); if (UsePrec) { - auto diag = Kokkos::subview( - _diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); using PrecOperator = KokkosBatched::JacobiPrec; PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -118,13 +110,11 @@ struct Functor_TestBatchedTeamVectorGMRES_1 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), _X.extent(1) + maximum_iteration + 3); + _handle.tmp_view = + typename KrylovHandleType::TemporaryViewType("", _X.extent(0), _X.extent(1) + maximum_iteration + 3); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) diff --git a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp index 3970b7e94a..22e735c304 100644 --- a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp +++ b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_2.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorGMRES_2 { const ValuesViewType _D; const ValuesViewType _diag; @@ -32,12 +32,11 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_2( - const ValuesViewType &D, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int N_team, - const int team_size, const int vector_length, const int N_iteration, - const double tol, const int ortho_strategy, const int arnoldi_level, - const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_2(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, const int N_iteration, + const double tol, const int ortho_strategy, const int arnoldi_level, + const int other_level, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -54,12 +53,11 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_2( - const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, - const IntView &c, const VectorViewType &X, const VectorViewType &B, - const int N_team, const int team_size, const int vector_length, - const int N_iteration, const double tol, int ortho_strategy, - const int arnoldi_level, const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_2(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int N_team, const int team_size, const int vector_length, + const int N_iteration, const double tol, int ortho_strategy, + const int arnoldi_level, const int other_level, KrylovHandleType &handle) : _D(D), _diag(diag), _r(r), @@ -81,60 +79,41 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - using ScratchPadIntViewType = - Kokkos::View; - using ScratchPadValuesViewType = Kokkos::View< - typename ValuesViewType::non_const_value_type **, - typename ValuesViewType::array_layout, - typename ValuesViewType::execution_space::scratch_memory_space>; - using Operator = - KokkosBatched::CrsMatrix; - - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); - - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntViewType = Kokkos::View; + using ScratchPadValuesViewType = + Kokkos::View; + using Operator = KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); + + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); if (UsePrec) { - ScratchPadValuesViewType diag( - member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + ScratchPadValuesViewType diag(member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); using PrecOperator = KokkosBatched::JacobiPrec; KokkosBatched::TeamVectorCopy::invoke( - member, - Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL), - diag); + member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -143,10 +122,8 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -158,8 +135,8 @@ struct Functor_TestBatchedTeamVectorGMRES_2 { _handle.set_memory_strategy(1); - _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( - "", _X.extent(0), _X.extent(1) + maximum_iteration + 3); + _handle.tmp_view = + typename KrylovHandleType::TemporaryViewType("", _X.extent(0), _X.extent(1) + maximum_iteration + 3); using ScalarType = typename ValuesViewType::non_const_value_type; using Layout = typename ValuesViewType::array_layout; diff --git a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp index 013984b3d1..7c7d9103b2 100644 --- a/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp +++ b/perf_test/batched/sparse/GMRES/Functor_TestBatchedTeamVectorGMRES_3.hpp @@ -14,8 +14,8 @@ // //@HEADER -template +template struct Functor_TestBatchedTeamVectorGMRES_3 { const ValuesViewType _D; const ValuesViewType _diag; @@ -32,12 +32,11 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_3( - const ValuesViewType &D, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int N_team, - const int team_size, const int vector_length, const int N_iteration, - const double tol, const int ortho_strategy, const int arnoldi_level, - const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_3(const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, const int N_iteration, + const double tol, const int ortho_strategy, const int arnoldi_level, + const int other_level, KrylovHandleType &handle) : _D(D), _r(r), _c(c), @@ -54,12 +53,11 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { _handle(handle) {} KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorGMRES_3( - const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, - const IntView &c, const VectorViewType &X, const VectorViewType &B, - const int N_team, const int team_size, const int vector_length, - const int N_iteration, const double tol, int ortho_strategy, - const int arnoldi_level, const int other_level, KrylovHandleType &handle) + Functor_TestBatchedTeamVectorGMRES_3(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int N_team, const int team_size, const int vector_length, + const int N_iteration, const double tol, int ortho_strategy, + const int arnoldi_level, const int other_level, KrylovHandleType &handle) : _D(D), _diag(diag), _r(r), @@ -81,60 +79,41 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { const int first_matrix = _handle.first_index(member.league_rank()); const int last_matrix = _handle.last_index(member.league_rank()); - using TeamVectorCopy1D = - KokkosBatched::TeamVectorCopy; - - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); - - using ScratchPadIntViewType = - Kokkos::View; - using ScratchPadValuesViewType = Kokkos::View< - typename ValuesViewType::non_const_value_type **, - typename ValuesViewType::array_layout, - typename ValuesViewType::execution_space::scratch_memory_space>; - using Operator = - KokkosBatched::CrsMatrix; - - ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), - _r.extent(0) + _c.extent(0)); - - auto r = - Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); - auto c = Kokkos::subview( - tmp_1D_int, - Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntViewType = Kokkos::View; + using ScratchPadValuesViewType = + Kokkos::View; + using Operator = KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), _r.extent(0) + _c.extent(0)); + + auto r = Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview(tmp_1D_int, Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); TeamVectorCopy1D::invoke(member, _r, r); TeamVectorCopy1D::invoke(member, _c, c); Operator A(d, r, c); if (UsePrec) { - ScratchPadValuesViewType diag( - member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + ScratchPadValuesViewType diag(member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); using PrecOperator = KokkosBatched::JacobiPrec; KokkosBatched::TeamVectorCopy::invoke( - member, - Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL), - diag); + member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); PrecOperator P(diag); P.setComputedInverse(); - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType, PrecOperator, KrylovHandleType>( - member, A, b, x, P, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, P, _handle); } else { - KokkosBatched::TeamVectorGMRES::template invoke< - Operator, VectorViewType>(member, A, b, x, _handle); + KokkosBatched::TeamVectorGMRES::template invoke(member, A, b, x, _handle); } } @@ -143,10 +122,8 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), - Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), - _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -168,14 +145,13 @@ struct Functor_TestBatchedTeamVectorGMRES_3 { size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); - size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); + size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; - policy.set_scratch_size( - 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); exec_space().fence(); timer.reset(); diff --git a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp index c0ce8f0bd4..f69ccadd7e 100644 --- a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp +++ b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp @@ -82,60 +82,50 @@ int main(int argc, char *argv[]) { for (int i = 1; i < argc; ++i) { const std::string &token = argv[i]; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos Batched GMRES performance test options:" << std::endl - << "-A : Filename of the input batched matrix." - << std::endl - << "-B : Filename of the input batched right-hand " - "side." - << std::endl - << "-X : Filename of the output batched solution." - << std::endl - << "-res : Filename of the output residual history." - << std::endl - << "-timers : Filename of the output timers." - << std::endl - << "-ortho_strategy : Select the orthogonalization strategy." - << std::endl - << "-arnoldi_level : Select the scratch pad level (if used) " - "where Arnoldi vectors are stored." - << std::endl - << "-other_level : Select the scratch pad level (if used) " - "where everything except the Arnoldi vectors are stored." - << std::endl - << "-n1 : Number of repetitions of the experience." - << std::endl - << "-n2 : Number of the kernel calls inside one " - "experience." - << std::endl - << "-team_size : Used team size." << std::endl - << "-n_implementations: Number of implementations to use: test " - "all " - "implementations [0, specified number -1]." - << std::endl - << "-implementation : Specify only one implementation at a time." - << std::endl - << " Note: implementation 0 : does not use " - "scratch pad." - << std::endl - << " Note: implementation 1 : use scratch pad " - "for the graph and for the diagonal entries of the matrices." - << std::endl - << " Note: implementation 2 : use scratch pad " - "for the graph and for the diagonal entries of the matrices and " - "for the temporary variable but not for the Arnoldi vectors." - << std::endl - << "-l : Specify left layout." << std::endl - << "-r : Specify right layout." << std::endl - << "-P : Specify if a Jacobi preconditioner is " - "used." - << std::endl - << "-C : Specify if the convergence is monitored." - << std::endl - << "-N_team : Specify the number of systems per team." - << std::endl - << "-vector_length : Specify the vector length." << std::endl - << std::endl; + std::cout << "Kokkos Batched GMRES performance test options:" << std::endl + << "-A : Filename of the input batched matrix." << std::endl + << "-B : Filename of the input batched right-hand " + "side." + << std::endl + << "-X : Filename of the output batched solution." << std::endl + << "-res : Filename of the output residual history." << std::endl + << "-timers : Filename of the output timers." << std::endl + << "-ortho_strategy : Select the orthogonalization strategy." << std::endl + << "-arnoldi_level : Select the scratch pad level (if used) " + "where Arnoldi vectors are stored." + << std::endl + << "-other_level : Select the scratch pad level (if used) " + "where everything except the Arnoldi vectors are stored." + << std::endl + << "-n1 : Number of repetitions of the experience." << std::endl + << "-n2 : Number of the kernel calls inside one " + "experience." + << std::endl + << "-team_size : Used team size." << std::endl + << "-n_implementations: Number of implementations to use: test " + "all " + "implementations [0, specified number -1]." + << std::endl + << "-implementation : Specify only one implementation at a time." << std::endl + << " Note: implementation 0 : does not use " + "scratch pad." + << std::endl + << " Note: implementation 1 : use scratch pad " + "for the graph and for the diagonal entries of the matrices." + << std::endl + << " Note: implementation 2 : use scratch pad " + "for the graph and for the diagonal entries of the matrices and " + "for the temporary variable but not for the Arnoldi vectors." + << std::endl + << "-l : Specify left layout." << std::endl + << "-r : Specify right layout." << std::endl + << "-P : Specify if a Jacobi preconditioner is " + "used." + << std::endl + << "-C : Specify if the convergence is monitored." << std::endl + << "-N_team : Specify the number of systems per team." << std::endl + << "-vector_length : Specify the vector length." << std::endl + << std::endl; return 0; } if (token == std::string("-A")) name_A = argv[++i]; @@ -143,26 +133,18 @@ int main(int argc, char *argv[]) { if (token == std::string("-X")) name_X = argv[++i]; if (token == std::string("-res")) name_conv = argv[++i]; if (token == std::string("-timers")) name_timer = argv[++i]; - if (token == std::string("-ortho_strategy")) - ortho_strategy = std::atoi(argv[++i]); - if (token == std::string("-arnoldi_level")) - arnoldi_level = std::atoi(argv[++i]); - if (token == std::string("-other_level")) - other_level = std::atoi(argv[++i]); + if (token == std::string("-ortho_strategy")) ortho_strategy = std::atoi(argv[++i]); + if (token == std::string("-arnoldi_level")) arnoldi_level = std::atoi(argv[++i]); + if (token == std::string("-other_level")) other_level = std::atoi(argv[++i]); if (token == std::string("-n1")) n_rep_1 = std::atoi(argv[++i]); if (token == std::string("-n2")) n_rep_2 = std::atoi(argv[++i]); - if (token == std::string("-n_iterations")) - n_iterations = std::atoi(argv[++i]); + if (token == std::string("-n_iterations")) n_iterations = std::atoi(argv[++i]); if (token == std::string("-tol")) tol = std::stod(argv[++i]); if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-N_team")) - N_team_potential = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-N_team")) N_team_potential = std::atoi(argv[++i]); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -179,9 +161,8 @@ int main(int argc, char *argv[]) { readSizesFromMM(name_A, Blk, ncols, nnz, N); - std::cout << "N_team_potential = " << N_team_potential << ", n = " << Blk - << ", N = " << N << ", team_size = " << team_size - << ", vector_length = " << vector_length << std::endl; + std::cout << "N_team_potential = " << N_team_potential << ", n = " << Blk << ", N = " << N + << ", team_size = " << team_size << ", vector_length = " << vector_length << std::endl; if (impls.size() == 0) for (int i = 0; i < n_impl; ++i) impls.push_back(i); @@ -190,9 +171,7 @@ int main(int argc, char *argv[]) { constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf( - " :::: GMRES Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", - N, Blk, nnz, vector_length, n_rep_1); + printf(" :::: GMRES Testing (N = %d, Blk = %d, nnz = %d, vl = %d, n = %d)\n", N, Blk, nnz, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -221,22 +200,18 @@ int main(int argc, char *argv[]) { XYTypeLL xLL("values", N, Blk); XYTypeLL yLL("values", N, Blk); - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); readArrayFromMM(name_B, yLL); - if (use_preconditioner) - getInvDiagFromCRS(valuesLL, rowOffsets, colIndices, diagLL); + if (use_preconditioner) getInvDiagFromCRS(valuesLL, rowOffsets, colIndices, diagLL); } if (layout_right) { readCRSFromMM(name_A, valuesLR, rowOffsets, colIndices); readArrayFromMM(name_B, yLR); - if (use_preconditioner) - getInvDiagFromCRS(valuesLR, rowOffsets, colIndices, diagLR); + if (use_preconditioner) getInvDiagFromCRS(valuesLR, rowOffsets, colIndices, diagLR); } for (auto i_impl : impls) { @@ -256,12 +231,9 @@ int main(int argc, char *argv[]) { using Scalar3DViewType = Kokkos::View; using IntViewType = Kokkos::View; - using KrylovHandleType = - KokkosBatched::KrylovHandle; + using KrylovHandleType = KokkosBatched::KrylovHandle; KrylovHandleType handle(N, N_team, n_iterations, true); - handle.Arnoldi_view = - Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); + handle.Arnoldi_view = Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); // handle.tmp_view = typename KrylovHandleType::TemporaryViewType( // "", N, Blk + n_iterations + 3); @@ -285,110 +257,86 @@ int main(int argc, char *argv[]) { if (i_impl == 0 && layout_left) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, true>( - valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, false>( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 1 && layout_left) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, true>( - valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, false>( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 2 && layout_left) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, true>( - valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLL, diagLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLL, IntView, XYTypeLL, - KrylovHandleType, false>( - valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLL, rowOffsets, colIndices, xLL, yLL, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 0 && layout_right) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, true>( - valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_1< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, false>( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_1( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 1 && layout_right) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, true>( - valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_2< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, false>( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_2( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } if (i_impl == 2 && layout_right) { if (use_preconditioner) - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, true>( - valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, - N_team, team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLR, diagLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, + n_iterations, tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); else - t_spmv += Functor_TestBatchedTeamVectorGMRES_3< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR, - KrylovHandleType, false>( - valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, - team_size, vector_length, n_iterations, tol, - ortho_strategy, arnoldi_level, other_level, handle) + t_spmv += Functor_TestBatchedTeamVectorGMRES_3( + valuesLR, rowOffsets, colIndices, xLR, yLR, N_team, team_size, vector_length, n_iterations, + tol, ortho_strategy, arnoldi_level, other_level, handle) .run(); } exec_space().fence(); @@ -403,10 +351,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -419,15 +365,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + std::to_string(i_impl) + "_l.mm", xLL); @@ -436,8 +377,7 @@ int main(int argc, char *argv[]) { writeArrayToMM(name_X + std::to_string(i_impl) + "_r.mm", xLR); } if (monitor_convergence) { - writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", - handle.residual_norms); + writeArrayToMM(name_conv + std::to_string(i_impl) + ".mm", handle.residual_norms); } } } diff --git a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp index 1eaacbde5e..53f1c48f6c 100644 --- a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp +++ b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp @@ -36,11 +36,9 @@ void writeArrayToMM(std::string name, const XType x) { myfile.close(); } -void readSizesFromMM(std::string name, int &nrows, int &ncols, int &nnz, - int &N) { +void readSizesFromMM(std::string name, int &nrows, int &ncols, int &nnz, int &N) { std::ifstream input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); std::string line_sizes; @@ -67,8 +65,7 @@ template void readArrayFromMM(std::string name, const XType &x) { std::ifstream input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); input.ignore(std::numeric_limits::max(), '\n'); typename XType::HostMirror x_h = Kokkos::create_mirror_view(x); @@ -85,8 +82,7 @@ template void readDenseFromMM(std::string name, const AType &A) { std::ifstream input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); input.ignore(std::numeric_limits::max(), '\n'); typename AType::HostMirror A_h = Kokkos::create_mirror_view(A); @@ -113,12 +109,10 @@ void readDenseFromMM(std::string name, const AType &A) { } template -void readCRSFromMM(std::string name, const VType &V, const IntType &r, - const IntType &c) { +void readCRSFromMM(std::string name, const VType &V, const IntType &r, const IntType &c) { std::ifstream input(name); - while (input.peek() == '%') - input.ignore(std::numeric_limits::max(), '\n'); + while (input.peek() == '%') input.ignore(std::numeric_limits::max(), '\n'); input.ignore(std::numeric_limits::max(), '\n'); typename VType::HostMirror V_h = Kokkos::create_mirror_view(V); @@ -137,8 +131,7 @@ void readCRSFromMM(std::string name, const VType &V, const IntType &r, input >> read_row >> c_h(i); --read_row; --c_h(i); - for (int tmp_row = current_row + 1; tmp_row <= read_row; ++tmp_row) - r_h(tmp_row) = i; + for (int tmp_row = current_row + 1; tmp_row <= read_row; ++tmp_row) r_h(tmp_row) = i; current_row = read_row; // if (VType::rank == 1) @@ -157,8 +150,7 @@ void readCRSFromMM(std::string name, const VType &V, const IntType &r, } template -void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, - const VType &diag) { +void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, const VType &diag) { auto diag_values_host = Kokkos::create_mirror_view(diag); auto values_host = Kokkos::create_mirror_view(V); auto row_ptr_host = Kokkos::create_mirror_view(r); @@ -173,12 +165,10 @@ void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, int BlkSize = diag.extent(1); for (int i = 0; i < BlkSize; ++i) { - for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); - ++current_index) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); ++current_index) { if (colIndices_host(current_index) == i) break; } - for (int j = 0; j < N; ++j) - diag_values_host(j, i) = 1. / values_host(j, current_index); + for (int j = 0; j < N; ++j) diag_values_host(j, i) = 1. / values_host(j, current_index); } Kokkos::deep_copy(diag, diag_values_host); diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp index 17b8ad6d3e..c1cdec2778 100644 --- a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp @@ -14,8 +14,7 @@ // //@HEADER -template +template struct BSPMV_Functor_View { typedef typename AMatrix::execution_space exec_space; typedef typename AMatrix::non_const_value_type value_type; @@ -36,11 +35,9 @@ struct BSPMV_Functor_View { const int N; int implementation; - BSPMV_Functor_View(const value_type* alpha_, const AMatrix m_A_values_, - const IntView m_A_row_ptr_, const IntView m_A_col_indices_, - const XVector m_x_, const value_type* beta_, - const YVector m_y_, const int matrices_per_team_, - const int N_, const int implementation_ = 0) + BSPMV_Functor_View(const value_type* alpha_, const AMatrix m_A_values_, const IntView m_A_row_ptr_, + const IntView m_A_col_indices_, const XVector m_x_, const value_type* beta_, const YVector m_y_, + const int matrices_per_team_, const int N_, const int implementation_ = 0) : alpha(alpha_), m_A_values(m_A_values_), m_A_row_ptr(m_A_row_ptr_), @@ -51,23 +48,16 @@ struct BSPMV_Functor_View { matrices_per_team(matrices_per_team_), N(N_), implementation(implementation_) { - static_assert(static_cast(AMatrix::rank) == 2, - "AMatrix must be a rank 2 View."); - static_assert(static_cast(IntView::rank) == 1, - "IntView must be a rank 1 View."); - static_assert(static_cast(XVector::rank) == 2, - "XVector must be a rank 2 View."); - static_assert(static_cast(YVector::rank) == 2, - "YVector must be a rank 2 View."); + static_assert(static_cast(AMatrix::rank) == 2, "AMatrix must be a rank 2 View."); + static_assert(static_cast(IntView::rank) == 1, "IntView must be a rank 1 View."); + static_assert(static_cast(XVector::rank) == 2, "XVector must be a rank 2 View."); + static_assert(static_cast(YVector::rank) == 2, "YVector must be a rank 2 View."); } - KOKKOS_INLINE_FUNCTION void getIndices(const ordinal_type iTemp, - const ordinal_type n_rows, - const ordinal_type n_matrices, - ordinal_type& iRow, + KOKKOS_INLINE_FUNCTION void getIndices(const ordinal_type iTemp, const ordinal_type n_rows, + const ordinal_type n_matrices, ordinal_type& iRow, ordinal_type& iMatrix) const { - if (std::is_same::value) { + if (std::is_same::value) { iRow = iTemp / n_matrices; iMatrix = iTemp % n_matrices; } else { @@ -78,90 +68,72 @@ struct BSPMV_Functor_View { KOKKOS_INLINE_FUNCTION void operator()(const team_member& dev) const { if (implementation == 0) { - const int first_matrix = - static_cast(dev.league_rank()) * matrices_per_team; - const int last_matrix = - static_cast(dev.league_rank() + 1) * matrices_per_team < N - ? static_cast(dev.league_rank() + 1) * matrices_per_team - : N; + const int first_matrix = static_cast(dev.league_rank()) * matrices_per_team; + const int last_matrix = static_cast(dev.league_rank() + 1) * matrices_per_team < N + ? static_cast(dev.league_rank() + 1) * matrices_per_team + : N; const ordinal_type n_rows = m_A_row_ptr.extent(0) - 1; for (int i_matrix = first_matrix; i_matrix < last_matrix; ++i_matrix) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange(dev, 0, n_rows), - [&](const ordinal_type& iRow) { - const ordinal_type row_length = - m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); - value_type sum = 0; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(dev, row_length), - [&](const ordinal_type& iEntry, value_type& lsum) { - const value_type val = - m_A_values(i_matrix, m_A_row_ptr(iRow) + iEntry); - lsum += - val * m_x(i_matrix, - m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); - }, - sum); - - Kokkos::single(Kokkos::PerThread(dev), [&]() { - sum *= alpha[i_matrix]; - - if (dobeta == 0) { - m_y(i_matrix, iRow) = sum; - } else { - m_y(i_matrix, iRow) = - beta[i_matrix] * m_y(i_matrix, iRow) + sum; - } - }); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, n_rows), [&](const ordinal_type& iRow) { + const ordinal_type row_length = m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); + value_type sum = 0; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(dev, row_length), + [&](const ordinal_type& iEntry, value_type& lsum) { + const value_type val = m_A_values(i_matrix, m_A_row_ptr(iRow) + iEntry); + lsum += val * m_x(i_matrix, m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); + }, + sum); + + Kokkos::single(Kokkos::PerThread(dev), [&]() { + sum *= alpha[i_matrix]; + + if (dobeta == 0) { + m_y(i_matrix, iRow) = sum; + } else { + m_y(i_matrix, iRow) = beta[i_matrix] * m_y(i_matrix, iRow) + sum; + } + }); + }); } } if (implementation == 1) { - const int first_matrix = - static_cast(dev.league_rank()) * matrices_per_team; - const int last_matrix = - static_cast(dev.league_rank() + 1) * matrices_per_team < N - ? static_cast(dev.league_rank() + 1) * matrices_per_team - : N; + const int first_matrix = static_cast(dev.league_rank()) * matrices_per_team; + const int last_matrix = static_cast(dev.league_rank() + 1) * matrices_per_team < N + ? static_cast(dev.league_rank() + 1) * matrices_per_team + : N; const int n_matrices = last_matrix - first_matrix; const ordinal_type n_rows = m_A_row_ptr.extent(0) - 1; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), - [&](const ordinal_type& iTemp) { - ordinal_type iRow, iMatrix; - this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); - const int iGlobalMatrix = first_matrix + iMatrix; + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), [&](const ordinal_type& iTemp) { + ordinal_type iRow, iMatrix; + this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); + const int iGlobalMatrix = first_matrix + iMatrix; - const ordinal_type row_length = - m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); - value_type sum = 0; + const ordinal_type row_length = m_A_row_ptr(iRow + 1) - m_A_row_ptr(iRow); + value_type sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int iEntry = 0; iEntry < row_length; ++iEntry) { - sum += m_A_values(iGlobalMatrix, m_A_row_ptr(iRow) + iEntry) * - m_x(iGlobalMatrix, - m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); - } - - sum *= alpha[iGlobalMatrix]; - - if (dobeta == 0) { - m_y(iGlobalMatrix, iRow) = sum; - } else { - m_y(iGlobalMatrix, iRow) = - beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; - } - }); + for (int iEntry = 0; iEntry < row_length; ++iEntry) { + sum += m_A_values(iGlobalMatrix, m_A_row_ptr(iRow) + iEntry) * + m_x(iGlobalMatrix, m_A_col_indices(m_A_row_ptr(iRow) + iEntry)); + } + + sum *= alpha[iGlobalMatrix]; + + if (dobeta == 0) { + m_y(iGlobalMatrix, iRow) = sum; + } else { + m_y(iGlobalMatrix, iRow) = beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; + } + }); } if (implementation == 2) { - using ScratchPadIntView = - Kokkos::View; + using ScratchPadIntView = Kokkos::View; const ordinal_type n_rows = m_A_row_ptr.extent(0) - 1; const ordinal_type nnz = m_A_col_indices.extent(0); @@ -169,51 +141,43 @@ struct BSPMV_Functor_View { ScratchPadIntView cols(dev.team_scratch(0), nnz); ScratchPadIntView row_map(dev.team_scratch(0), n_rows + 1); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, n_rows + 1), - [&](const ordinal_type& i) { row_map(i) = m_A_row_ptr(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, n_rows + 1), + [&](const ordinal_type& i) { row_map(i) = m_A_row_ptr(i); }); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, nnz), - [&](const ordinal_type& i) { cols(i) = m_A_col_indices(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, nnz), + [&](const ordinal_type& i) { cols(i) = m_A_col_indices(i); }); dev.team_barrier(); - const int first_matrix = - static_cast(dev.league_rank()) * matrices_per_team; - const int last_matrix = - static_cast(dev.league_rank() + 1) * matrices_per_team < N - ? static_cast(dev.league_rank() + 1) * matrices_per_team - : N; - const int n_matrices = last_matrix - first_matrix; + const int first_matrix = static_cast(dev.league_rank()) * matrices_per_team; + const int last_matrix = static_cast(dev.league_rank() + 1) * matrices_per_team < N + ? static_cast(dev.league_rank() + 1) * matrices_per_team + : N; + const int n_matrices = last_matrix - first_matrix; - Kokkos::parallel_for( - Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), - [&](const ordinal_type& iTemp) { - ordinal_type iRow, iMatrix; - this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); - const int iGlobalMatrix = first_matrix + iMatrix; + Kokkos::parallel_for(Kokkos::TeamVectorRange(dev, 0, n_rows * n_matrices), [&](const ordinal_type& iTemp) { + ordinal_type iRow, iMatrix; + this->getIndices(iTemp, n_rows, n_matrices, iRow, iMatrix); + const int iGlobalMatrix = first_matrix + iMatrix; - const ordinal_type row_length = row_map(iRow + 1) - row_map(iRow); - value_type sum = 0; + const ordinal_type row_length = row_map(iRow + 1) - row_map(iRow); + value_type sum = 0; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif - for (int iEntry = 0; iEntry < row_length; ++iEntry) { - sum += m_A_values(iGlobalMatrix, row_map(iRow) + iEntry) * - m_x(iGlobalMatrix, cols(row_map(iRow) + iEntry)); - } - - sum *= alpha[iGlobalMatrix]; - - if (dobeta == 0) { - m_y(iGlobalMatrix, iRow) = sum; - } else { - m_y(iGlobalMatrix, iRow) = - beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; - } - }); + for (int iEntry = 0; iEntry < row_length; ++iEntry) { + sum += m_A_values(iGlobalMatrix, row_map(iRow) + iEntry) * m_x(iGlobalMatrix, cols(row_map(iRow) + iEntry)); + } + + sum *= alpha[iGlobalMatrix]; + + if (dobeta == 0) { + m_y(iGlobalMatrix, iRow) = sum; + } else { + m_y(iGlobalMatrix, iRow) = beta[iGlobalMatrix] * m_y(iGlobalMatrix, iRow) + sum; + } + }); } } }; \ No newline at end of file diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp index 06ea55e303..e93c65f7f9 100644 --- a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp @@ -31,9 +31,8 @@ typedef typename exec_space::memory_space memory_space; typedef Kokkos::DefaultHostExecutionSpace host_space; typedef typename Kokkos::Device device; -template +template struct Functor_TestBatchedTeamVectorSpmv { PolicyType _policy; const alphaViewType _alpha; @@ -46,10 +45,9 @@ struct Functor_TestBatchedTeamVectorSpmv { int _matrices_per_team; KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamVectorSpmv( - PolicyType policy, const alphaViewType &alpha, const DViewType &D, - const IntView &r, const IntView &c, const xViewType &X, - const betaViewType &beta, const yViewType &Y, const int matrices_per_team) + Functor_TestBatchedTeamVectorSpmv(PolicyType policy, const alphaViewType &alpha, const DViewType &D, const IntView &r, + const IntView &c, const xViewType &X, const betaViewType &beta, const yViewType &Y, + const int matrices_per_team) : _policy(policy), _alpha(alpha), _D(D), @@ -62,28 +60,19 @@ struct Functor_TestBatchedTeamVectorSpmv { template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { - const int first_matrix = - static_cast(member.league_rank()) * _matrices_per_team; - const int N = _D.extent(0); - const int last_matrix = - (static_cast(member.league_rank() + 1) * _matrices_per_team < N - ? static_cast(member.league_rank() + 1) * _matrices_per_team - : N); - - auto alpha_team = - Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); - auto D_team = Kokkos::subview( - _D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto X_team = Kokkos::subview( - _X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - auto beta_team = - Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); - auto Y_team = Kokkos::subview( - _Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - - using ScratchPadIntView = - Kokkos::View; + const int first_matrix = static_cast(member.league_rank()) * _matrices_per_team; + const int N = _D.extent(0); + const int last_matrix = (static_cast(member.league_rank() + 1) * _matrices_per_team < N + ? static_cast(member.league_rank() + 1) * _matrices_per_team + : N); + + auto alpha_team = Kokkos::subview(_alpha, Kokkos::make_pair(first_matrix, last_matrix)); + auto D_team = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto X_team = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto beta_team = Kokkos::subview(_beta, Kokkos::make_pair(first_matrix, last_matrix)); + auto Y_team = Kokkos::subview(_Y, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + + using ScratchPadIntView = Kokkos::View; const int n_rows = _r.extent(0) - 1; const int nnz = _c.extent(0); @@ -91,31 +80,23 @@ struct Functor_TestBatchedTeamVectorSpmv { ScratchPadIntView cols(member.team_scratch(0), nnz); ScratchPadIntView row_map(member.team_scratch(0), n_rows + 1); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, n_rows + 1), - [&](const int &i) { row_map(i) = _r(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, n_rows + 1), [&](const int &i) { row_map(i) = _r(i); }); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, nnz), - [&](const int &i) { cols(i) = _c(i); }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, nnz), [&](const int &i) { cols(i) = _c(i); }); member.team_barrier(); if (last_matrix != N && _matrices_per_team == 8) - KokkosBatched::TeamVectorSpmv< - MemberType, KokkosBatched::Trans::NoTranspose, - 8>::template invoke( + KokkosBatched::TeamVectorSpmv::template invoke< + DViewType, ScratchPadIntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>( member, alpha_team, D_team, row_map, cols, X_team, beta_team, Y_team); else - KokkosBatched::TeamVectorSpmv< - MemberType, KokkosBatched::Trans::NoTranspose, - 1>::template invoke( + KokkosBatched::TeamVectorSpmv::template invoke< + DViewType, ScratchPadIntView, xViewType, yViewType, alphaViewType, betaViewType, dobeta>( member, alpha_team, D_team, row_map, cols, X_team, beta_team, Y_team); } - inline void run() { - Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", _policy, *this); - } + inline void run() { Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", _policy, *this); } }; int main(int argc, char *argv[]) { @@ -151,53 +132,46 @@ int main(int argc, char *argv[]) { for (int i = 1; i < argc; ++i) { const std::string &token = argv[i]; if (token == std::string("--help") || token == std::string("-h")) { - std::cout - << "Kokkos Batched SPMV performance test options:" << std::endl - << "-A : Filename of the input batched matrix." - << std::endl - << "-B : Filename of the input batched right-hand " - "side." - << std::endl - << "-X : Filename of the output batched solution." - << std::endl - << "-timers : Filename of the output timers." - << std::endl - << "-n1 : Number of repetitions of the experience." - << std::endl - << "-n2 : Number of the kernel calls inside one " - "experience." - << std::endl - << "-team_size : Used team size." << std::endl - << "-n_implementations: Number of implementations to use: test " - "all " - "implementations [0, specified number -1]." - << std::endl - << "-implementation : Specify only one implementation at a time." - << std::endl - << " Note: implementation 0 : use a Team " - "approach where a Team have to apply N_team SPMV. A given team " - "applies N_team SPMV sequentially and uses a ThreadRange over " - "the row and a VectorRange over the non zero entries of a given " - "row." - << std::endl - << " Note: implementation 1 : use a Team " - "approach where a Team have to apply N_team SPMV. A given team " - "uses a fused thread vector range policy to loop over the " - "independent fibers." - << std::endl - << " Note: implementation 2 : same as " - "implementation 1 but using scratch pad for the graph." - << std::endl - << " Note: implementation 3 : same as " - "implementation 1 but using the kernels from " - "batched/sparse/impl/*." - << std::endl - << "-l : Specify left layout." << std::endl - << "-r : Specify right layout." << std::endl - << "-N_team : Specify the number of systems per team." - << std::endl - << "-vector_length : Specify the vector length." << std::endl - << std::endl; + std::cout << "Kokkos Batched SPMV performance test options:" << std::endl + << "-A : Filename of the input batched matrix." << std::endl + << "-B : Filename of the input batched right-hand " + "side." + << std::endl + << "-X : Filename of the output batched solution." << std::endl + << "-timers : Filename of the output timers." << std::endl + << "-n1 : Number of repetitions of the experience." << std::endl + << "-n2 : Number of the kernel calls inside one " + "experience." + << std::endl + << "-team_size : Used team size." << std::endl + << "-n_implementations: Number of implementations to use: test " + "all " + "implementations [0, specified number -1]." + << std::endl + << "-implementation : Specify only one implementation at a time." << std::endl + << " Note: implementation 0 : use a Team " + "approach where a Team have to apply N_team SPMV. A given team " + "applies N_team SPMV sequentially and uses a ThreadRange over " + "the row and a VectorRange over the non zero entries of a given " + "row." + << std::endl + << " Note: implementation 1 : use a Team " + "approach where a Team have to apply N_team SPMV. A given team " + "uses a fused thread vector range policy to loop over the " + "independent fibers." + << std::endl + << " Note: implementation 2 : same as " + "implementation 1 but using scratch pad for the graph." + << std::endl + << " Note: implementation 3 : same as " + "implementation 1 but using the kernels from " + "batched/sparse/impl/*." + << std::endl + << "-l : Specify left layout." << std::endl + << "-r : Specify right layout." << std::endl + << "-N_team : Specify the number of systems per team." << std::endl + << "-vector_length : Specify the vector length." << std::endl + << std::endl; return 0; } if (token == std::string("-A")) name_A = argv[++i]; @@ -209,15 +183,11 @@ int main(int argc, char *argv[]) { if (token == std::string("-n1")) n_rep_1 = std::atoi(argv[++i]); if (token == std::string("-n2")) n_rep_2 = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-N_team")) - N_team_potential = std::atoi(argv[++i]); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-N_team")) N_team_potential = std::atoi(argv[++i]); if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -244,8 +214,7 @@ int main(int argc, char *argv[]) { printf( " :::: Testing (N = %d, Blk = %d, nnz = %d, vl = %d, vi = %d, n = " "%d, N_team_potential = %d)\n", - N, Blk, nnz, vector_length, internal_vector_length, n_rep_1, - N_team_potential); + N, Blk, nnz, vector_length, internal_vector_length, n_rep_1, N_team_potential); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -274,10 +243,8 @@ int main(int argc, char *argv[]) { double *s_a = new double[N]; double *s_b = new double[N]; - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); @@ -301,8 +268,7 @@ int main(int argc, char *argv[]) { Kokkos::deep_copy(alphaV, alphaV_h); Kokkos::deep_copy(betaV, betaV_h); - using ScratchPadIntView = - Kokkos::View; + using ScratchPadIntView = Kokkos::View; for (auto i_impl : impls) { std::vector timers; @@ -327,12 +293,9 @@ int main(int argc, char *argv[]) { if (layout_left) { using policy_type = Kokkos::TeamPolicy; - policy_type auto_policy(number_of_teams, Kokkos::AUTO(), - Kokkos::AUTO()); - policy_type tuned_policy(number_of_teams, team_size, - Kokkos::AUTO()); - policy_type tuned_policy_2(number_of_teams, team_size, - vector_length); + policy_type auto_policy(number_of_teams, Kokkos::AUTO(), Kokkos::AUTO()); + policy_type tuned_policy(number_of_teams, team_size, Kokkos::AUTO()); + policy_type tuned_policy_2(number_of_teams, team_size, vector_length); policy_type policy; if (team_size < 1) @@ -347,33 +310,24 @@ int main(int argc, char *argv[]) { size_t bytes_0 = ScratchPadIntView::shmem_size(Blk + 1); size_t bytes_1 = ScratchPadIntView::shmem_size(nnz); - if (i_impl > 1) - policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + bytes_1)); + if (i_impl > 1) policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + bytes_1)); // policy.set_scratch_size(1, Kokkos::PerTeam(bytes_1)); if (i_impl == 3) { - Functor_TestBatchedTeamVectorSpmv< - policy_type, AMatrixValueViewLL, IntView, XYTypeLL, XYTypeLL, - alphaViewType, alphaViewType, 0>(policy, alphaV, valuesLL, - rowOffsets, colIndices, xLL, - betaV, yLL, N_team) + Functor_TestBatchedTeamVectorSpmv(policy, alphaV, valuesLL, rowOffsets, + colIndices, xLL, betaV, yLL, N_team) .run(); } else { - Kokkos::parallel_for( - "KokkosSparse::PerfTest::BSpMV", policy, - BSPMV_Functor_View(s_a, valuesLL, rowOffsets, - colIndices, xLL, s_b, yLL, - N_team, N, i_impl)); + Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", policy, + BSPMV_Functor_View( + s_a, valuesLL, rowOffsets, colIndices, xLL, s_b, yLL, N_team, N, i_impl)); } } if (layout_right) { using policy_type = Kokkos::TeamPolicy; - policy_type auto_policy(number_of_teams, Kokkos::AUTO(), - Kokkos::AUTO()); - policy_type tuned_policy(number_of_teams, team_size, - Kokkos::AUTO()); - policy_type tuned_policy_2(number_of_teams, team_size, - vector_length); + policy_type auto_policy(number_of_teams, Kokkos::AUTO(), Kokkos::AUTO()); + policy_type tuned_policy(number_of_teams, team_size, Kokkos::AUTO()); + policy_type tuned_policy_2(number_of_teams, team_size, vector_length); policy_type policy; if (team_size < 1) @@ -385,23 +339,17 @@ int main(int argc, char *argv[]) { size_t bytes_0 = ScratchPadIntView::shmem_size(Blk + 1); size_t bytes_1 = ScratchPadIntView::shmem_size(nnz); - if (i_impl > 1) - policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + bytes_1)); + if (i_impl > 1) policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0 + bytes_1)); // policy.set_scratch_size(1, Kokkos::PerTeam(bytes_1)); if (i_impl == 3) { - Functor_TestBatchedTeamVectorSpmv< - policy_type, AMatrixValueViewLR, IntView, XYTypeLR, XYTypeLR, - alphaViewType, alphaViewType, 0>(policy, alphaV, valuesLR, - rowOffsets, colIndices, xLR, - betaV, yLR, N_team) + Functor_TestBatchedTeamVectorSpmv(policy, alphaV, valuesLR, rowOffsets, + colIndices, xLR, betaV, yLR, N_team) .run(); } else { - Kokkos::parallel_for( - "KokkosSparse::PerfTest::BSpMV", policy, - BSPMV_Functor_View(s_a, valuesLR, rowOffsets, - colIndices, xLR, s_b, yLR, - N_team, N, i_impl)); + Kokkos::parallel_for("KokkosSparse::PerfTest::BSpMV", policy, + BSPMV_Functor_View( + s_a, valuesLR, rowOffsets, colIndices, xLR, s_b, yLR, N_team, N, i_impl)); } } exec_space().fence(); @@ -416,10 +364,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -432,8 +378,7 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); if (layout_left) printf( diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp index 2294c23805..5e9bf13f8c 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp @@ -71,9 +71,7 @@ struct Functor_Test_BatchedDenseCuSolve { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_BatchedDenseCuSolve(const MatrixViewType &A, - const VectorViewType &X, - const VectorViewType &B) + Functor_Test_BatchedDenseCuSolve(const MatrixViewType &A, const VectorViewType &X, const VectorViewType &B) : _A(A), _X(X), _B(B) {} inline double run() { @@ -100,10 +98,8 @@ struct Functor_Test_BatchedDenseCuSolve { double **d_Aarray = nullptr; double **d_Barray = nullptr; - cudaMalloc(reinterpret_cast(&d_Aarray), - sizeof(double *) * batchSize); - cudaMalloc(reinterpret_cast(&d_Barray), - sizeof(double *) * batchSize); + cudaMalloc(reinterpret_cast(&d_Aarray), sizeof(double *) * batchSize); + cudaMalloc(reinterpret_cast(&d_Barray), sizeof(double *) * batchSize); std::vector Aarray(batchSize, nullptr); std::vector Barray(batchSize, nullptr); @@ -112,34 +108,26 @@ struct Functor_Test_BatchedDenseCuSolve { Barray[i] = Kokkos::subview(_X, i, Kokkos::ALL).data(); } - cudaMemcpyAsync(d_Aarray, Aarray.data(), sizeof(double *) * batchSize, - cudaMemcpyHostToDevice); - cudaMemcpyAsync(d_Barray, Barray.data(), sizeof(double *) * batchSize, - cudaMemcpyHostToDevice); + cudaMemcpyAsync(d_Aarray, Aarray.data(), sizeof(double *) * batchSize, cudaMemcpyHostToDevice); + cudaMemcpyAsync(d_Barray, Barray.data(), sizeof(double *) * batchSize, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); exec_space().fence(); timer.reset(); - auto status1 = cusolverDnDpotrfBatched(handle, uplo, m, d_Aarray, lda, - d_infoArray, batchSize); + auto status1 = cusolverDnDpotrfBatched(handle, uplo, m, d_Aarray, lda, d_infoArray, batchSize); if (status1 != CUSOLVER_STATUS_SUCCESS) - std::cout << "Error in cusolverDnDpotrfBatched with batchSize = " - << batchSize << " and m = " << m << std::endl; + std::cout << "Error in cusolverDnDpotrfBatched with batchSize = " << batchSize << " and m = " << m << std::endl; cudaDeviceSynchronize(); - auto status2 = cusolverDnDpotrsBatched(handle, uplo, m, 1, d_Aarray, lda, - d_Barray, ldb, info, batchSize); + auto status2 = cusolverDnDpotrsBatched(handle, uplo, m, 1, d_Aarray, lda, d_Barray, ldb, info, batchSize); if (status2 != CUSOLVER_STATUS_SUCCESS) { if (status2 == CUSOLVER_STATUS_NOT_INITIALIZED) - std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " - << batchSize << " and m = " << m + std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " << batchSize << " and m = " << m << " CUSOLVER_STATUS_NOT_INITIALIZED " << std::endl; if (status2 == CUSOLVER_STATUS_INVALID_VALUE) - std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " - << batchSize << " and m = " << m + std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " << batchSize << " and m = " << m << " CUSOLVER_STATUS_INVALID_VALUE " << std::endl; if (status2 == CUSOLVER_STATUS_INTERNAL_ERROR) - std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " - << batchSize << " and m = " << m + std::cout << "Error in cusolverDnDpotrsBatched with batchSize = " << batchSize << " and m = " << m << " CUSOLVER_STATUS_INTERNAL_ERROR " << std::endl; cudaDeviceSynchronize(); exec_space().fence(); @@ -189,12 +177,9 @@ int main(int argc, char *argv[]) { if (token == std::string("-X")) name_X = argv[++i]; if (token == std::string("-timers")) name_timer = argv[++i]; if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -219,8 +204,7 @@ int main(int argc, char *argv[]) { constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf(" :::: CusolverDn Testing (N = %d, Blk = %d, vl = %d, n = %d)\n", N, - Blk, vector_length, n_rep_1); + printf(" :::: CusolverDn Testing (N = %d, Blk = %d, vl = %d, n = %d)\n", N, Blk, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -240,10 +224,8 @@ int main(int argc, char *argv[]) { XYTypeLL xLL("values", N, Blk); XYTypeLL yLL("values", N, Blk); - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readDenseFromMM(name_A, aLL); @@ -269,9 +251,7 @@ int main(int argc, char *argv[]) { if (i_impl == 0) { if (layout_right) { - t_spmv = Functor_Test_BatchedDenseCuSolve(aLR, xLR, yLR) - .run(); + t_spmv = Functor_Test_BatchedDenseCuSolve(aLR, xLR, yLR).run(); } } exec_space().fence(); @@ -285,10 +265,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -301,15 +279,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + std::to_string(i_impl) + "_l.mm", xLL); diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp index 808e235edc..8b2b48c0f4 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp @@ -26,7 +26,7 @@ #include "Kokkos_Sort.hpp" // -//#define KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// #define KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -66,8 +66,7 @@ typedef typename exec_space::memory_space memory_space; typedef Kokkos::DefaultHostExecutionSpace host_space; typedef typename Kokkos::Device device; -template +template struct Functor_Test_SparseCuSolveQR { const MatrixViewType _A; const IntView _r; @@ -76,8 +75,7 @@ struct Functor_Test_SparseCuSolveQR { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_Test_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} @@ -94,10 +92,8 @@ struct Functor_Test_SparseCuSolveQR { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -110,10 +106,8 @@ struct Functor_Test_SparseCuSolveQR { auto b = Kokkos::subview(_B, i, Kokkos::ALL).data(); auto x = Kokkos::subview(_X, i, Kokkos::ALL).data(); - cusolverSpDcsrlsvqr(handle, m, nnz, descrA, csrValA, _r.data(), _c.data(), - b, tol, reorder, x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + cusolverSpDcsrlsvqr(handle, m, nnz, descrA, csrValA, _r.data(), _c.data(), b, tol, reorder, x, singularity); + if (singularity[0] != -1) std::cout << " Error ! " << singularity[0] << " " << m << std::endl; } exec_space().fence(); @@ -124,8 +118,7 @@ struct Functor_Test_SparseCuSolveQR { } }; -template +template struct Functor_Test_Block_SparseCuSolveQR { const MatrixViewType _A; const IntView _r; @@ -134,9 +127,8 @@ struct Functor_Test_Block_SparseCuSolveQR { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_Block_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, - const IntView &c, const VectorViewType &X, - const VectorViewType &B) + Functor_Test_Block_SparseCuSolveQR(const MatrixViewType &A, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} inline double run() { @@ -155,10 +147,8 @@ struct Functor_Test_Block_SparseCuSolveQR { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -180,15 +170,12 @@ struct Functor_Test_Block_SparseCuSolveQR { rowOffsets_host(0) = 0; for (size_t i = 0; i < N; ++i) { for (size_t row = 0; row < m; ++row) { - const size_t current_row_index = i * m + row; - const size_t row_length = _r_host(row + 1) - _r_host(row); - rowOffsets_host(current_row_index + 1) = - rowOffsets_host(current_row_index) + row_length; + const size_t current_row_index = i * m + row; + const size_t row_length = _r_host(row + 1) - _r_host(row); + rowOffsets_host(current_row_index + 1) = rowOffsets_host(current_row_index) + row_length; for (size_t nnz_row = 0; nnz_row < row_length; ++nnz_row) { - const size_t current_block_nnz_index = - rowOffsets_host(current_row_index) + nnz_row; - const size_t current_block_col_index = - _c_host(_r_host(row) + nnz_row) + i * m; + const size_t current_block_nnz_index = rowOffsets_host(current_row_index) + nnz_row; + const size_t current_block_col_index = _c_host(_r_host(row) + nnz_row) + i * m; colIndices_host(current_block_nnz_index) = current_block_col_index; } } @@ -204,12 +191,10 @@ struct Functor_Test_Block_SparseCuSolveQR { auto b = _B.data(); auto x = _X.data(); - cusolverSpDcsrlsvqr(handle, block_m, block_nnz, descrA, csrValA, - rowOffsets.data(), colIndices.data(), b, tol, reorder, - x, singularity); + cusolverSpDcsrlsvqr(handle, block_m, block_nnz, descrA, csrValA, rowOffsets.data(), colIndices.data(), b, tol, + reorder, x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + if (singularity[0] != -1) std::cout << " Error ! " << singularity[0] << " " << m << std::endl; exec_space().fence(); double sec = timer.seconds(); @@ -219,8 +204,7 @@ struct Functor_Test_Block_SparseCuSolveQR { } }; -template +template struct Functor_Test_SparseCuSolveChol { const MatrixViewType _A; const IntView _r; @@ -229,8 +213,7 @@ struct Functor_Test_SparseCuSolveChol { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_SparseCuSolveChol(const MatrixViewType &A, const IntView &r, - const IntView &c, const VectorViewType &X, + Functor_Test_SparseCuSolveChol(const MatrixViewType &A, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} @@ -247,10 +230,8 @@ struct Functor_Test_SparseCuSolveChol { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -263,10 +244,8 @@ struct Functor_Test_SparseCuSolveChol { auto b = Kokkos::subview(_B, i, Kokkos::ALL).data(); auto x = Kokkos::subview(_X, i, Kokkos::ALL).data(); - cusolverSpDcsrlsvchol(handle, m, nnz, descrA, csrValA, _r.data(), - _c.data(), b, tol, reorder, x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + cusolverSpDcsrlsvchol(handle, m, nnz, descrA, csrValA, _r.data(), _c.data(), b, tol, reorder, x, singularity); + if (singularity[0] != -1) std::cout << " Error ! " << singularity[0] << " " << m << std::endl; } exec_space().fence(); @@ -277,8 +256,7 @@ struct Functor_Test_SparseCuSolveChol { } }; -template +template struct Functor_Test_Block_SparseCuSolveChol { const MatrixViewType _A; const IntView _r; @@ -287,10 +265,8 @@ struct Functor_Test_Block_SparseCuSolveChol { const VectorViewType _B; KOKKOS_INLINE_FUNCTION - Functor_Test_Block_SparseCuSolveChol(const MatrixViewType &A, - const IntView &r, const IntView &c, - const VectorViewType &X, - const VectorViewType &B) + Functor_Test_Block_SparseCuSolveChol(const MatrixViewType &A, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B) : _A(A), _r(r), _c(c), _X(X), _B(B) {} inline double run() { @@ -309,10 +285,8 @@ struct Functor_Test_Block_SparseCuSolveChol { cusparseMatDescr_t descrA = 0; KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); double tol = 1e-18; int reorder = 0; @@ -334,15 +308,12 @@ struct Functor_Test_Block_SparseCuSolveChol { rowOffsets_host(0) = 0; for (size_t i = 0; i < N; ++i) { for (size_t row = 0; row < m; ++row) { - const size_t current_row_index = i * m + row; - const size_t row_length = _r_host(row + 1) - _r_host(row); - rowOffsets_host(current_row_index + 1) = - rowOffsets_host(current_row_index) + row_length; + const size_t current_row_index = i * m + row; + const size_t row_length = _r_host(row + 1) - _r_host(row); + rowOffsets_host(current_row_index + 1) = rowOffsets_host(current_row_index) + row_length; for (size_t nnz_row = 0; nnz_row < row_length; ++nnz_row) { - const size_t current_block_nnz_index = - rowOffsets_host(current_row_index) + nnz_row; - const size_t current_block_col_index = - _c_host(_r_host(row) + nnz_row) + i * m; + const size_t current_block_nnz_index = rowOffsets_host(current_row_index) + nnz_row; + const size_t current_block_col_index = _c_host(_r_host(row) + nnz_row) + i * m; colIndices_host(current_block_nnz_index) = current_block_col_index; } } @@ -358,11 +329,9 @@ struct Functor_Test_Block_SparseCuSolveChol { auto b = _B.data(); auto x = _X.data(); - cusolverSpDcsrlsvchol(handle, block_m, block_nnz, descrA, csrValA, - rowOffsets.data(), colIndices.data(), b, tol, reorder, - x, singularity); - if (singularity[0] != -1) - std::cout << " Error ! " << singularity[0] << " " << m << std::endl; + cusolverSpDcsrlsvchol(handle, block_m, block_nnz, descrA, csrValA, rowOffsets.data(), colIndices.data(), b, tol, + reorder, x, singularity); + if (singularity[0] != -1) std::cout << " Error ! " << singularity[0] << " " << m << std::endl; exec_space().fence(); double sec = timer.seconds(); @@ -407,12 +376,9 @@ int main(int argc, char *argv[]) { if (token == std::string("-X")) name_X = argv[++i]; if (token == std::string("-timers")) name_timer = argv[++i]; if (token == std::string("-team_size")) team_size = std::atoi(argv[++i]); - if (token == std::string("-vector_length")) - vector_length = std::atoi(argv[++i]); - if (token == std::string("-n_implementations")) - n_impl = std::atoi(argv[++i]); - if (token == std::string("-implementation")) - impls.push_back(std::atoi(argv[++i])); + if (token == std::string("-vector_length")) vector_length = std::atoi(argv[++i]); + if (token == std::string("-n_implementations")) n_impl = std::atoi(argv[++i]); + if (token == std::string("-implementation")) impls.push_back(std::atoi(argv[++i])); if (token == std::string("-l")) { layout_left = true; layout_right = false; @@ -437,8 +403,7 @@ int main(int argc, char *argv[]) { constexpr size_t LLC_CAPACITY = 80 * 6 * 1024 * 1024; KokkosBatched::Flush flush; - printf(" :::: CusolverSp Testing (N = %d, Blk = %d, vl = %d, n = %d)\n", N, - Blk, vector_length, n_rep_1); + printf(" :::: CusolverSp Testing (N = %d, Blk = %d, vl = %d, n = %d)\n", N, Blk, vector_length, n_rep_1); typedef Kokkos::LayoutRight LR; typedef Kokkos::LayoutLeft LL; @@ -460,10 +425,8 @@ int main(int argc, char *argv[]) { XYTypeLL xLL("values", N, Blk); XYTypeLL yLL("values", N, Blk); - if (layout_left) - printf(" :::: Testing left layout (team_size = %d)\n", team_size); - if (layout_right) - printf(" :::: Testing right layout (team_size = %d)\n", team_size); + if (layout_left) printf(" :::: Testing left layout (team_size = %d)\n", team_size); + if (layout_right) printf(" :::: Testing right layout (team_size = %d)\n", team_size); if (layout_left) { readCRSFromMM(name_A, valuesLL, rowOffsets, colIndices); @@ -490,34 +453,28 @@ int main(int argc, char *argv[]) { if (i_impl == 0) { if (layout_right) { - t_spmv = Functor_Test_SparseCuSolveQR( - valuesLR, rowOffsets, colIndices, xLR, yLR) + t_spmv = Functor_Test_SparseCuSolveQR(valuesLR, rowOffsets, + colIndices, xLR, yLR) .run(); } } if (i_impl == 1) { if (layout_right) { - t_spmv = - Functor_Test_SparseCuSolveChol( - valuesLR, rowOffsets, colIndices, xLR, yLR) - .run(); + t_spmv = Functor_Test_SparseCuSolveChol( + valuesLR, rowOffsets, colIndices, xLR, yLR) + .run(); } } if (i_impl == 2) { if (layout_right) { - t_spmv = - Functor_Test_Block_SparseCuSolveQR( - valuesLR, rowOffsets, colIndices, xLR, yLR) - .run(); + t_spmv = Functor_Test_Block_SparseCuSolveQR( + valuesLR, rowOffsets, colIndices, xLR, yLR) + .run(); } } if (i_impl == 3) { if (layout_right) { - t_spmv = Functor_Test_Block_SparseCuSolveChol< - exec_space, AMatrixValueViewLR, IntView, XYTypeLR>( + t_spmv = Functor_Test_Block_SparseCuSolveChol( valuesLR, rowOffsets, colIndices, xLR, yLR) .run(); } @@ -533,10 +490,8 @@ int main(int argc, char *argv[]) { { std::ofstream myfile; std::string name; - if (layout_left) - name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; - if (layout_right) - name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; + if (layout_left) name = name_timer + "_" + std::to_string(i_impl) + "_left.txt"; + if (layout_right) name = name_timer + "_" + std::to_string(i_impl) + "_right.txt"; myfile.open(name); @@ -549,15 +504,10 @@ int main(int argc, char *argv[]) { double average_time = 0.; - for (size_t i = 0; i < timers.size(); ++i) - average_time += timers[i] / timers.size(); + for (size_t i = 0; i < timers.size(); ++i) average_time += timers[i] / timers.size(); - if (layout_left) - printf("Left layout: Implementation %d: solve time = %f\n", i_impl, - average_time); - if (layout_right) - printf("Right layout: Implementation %d: solve time = %f\n", i_impl, - average_time); + if (layout_left) printf("Left layout: Implementation %d: solve time = %f\n", i_impl, average_time); + if (layout_right) printf("Right layout: Implementation %d: solve time = %f\n", i_impl, average_time); if (layout_left) { writeArrayToMM(name_X + std::to_string(i_impl) + "_l.mm", xLL); diff --git a/perf_test/blas/KokkosBlas_blas1.cpp b/perf_test/blas/KokkosBlas_blas1.cpp index 52d2cd4b42..b9471dee37 100644 --- a/perf_test/blas/KokkosBlas_blas1.cpp +++ b/perf_test/blas/KokkosBlas_blas1.cpp @@ -40,8 +40,7 @@ RCP